Skip to content

Commit ad4cd6f

Browse files
ti-chi-bot and RidRisR authored
backup: update Backup CR status to Failed when log backup task not found (#6630) (#6637)
Co-authored-by: RidRisR <[email protected]>
1 parent 2ae79ca commit ad4cd6f

File tree

3 files changed

+211
-1
lines changed

3 files changed

+211
-1
lines changed

pkg/backup/backup/backup_tracker.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -453,7 +453,15 @@ func (bt *backupTracker) SyncLogBackupState(backup *v1alpha1.Backup) (bool, erro
453453
// After stop, key not existing is normal
454454
return true, nil
455455
}
456-
// For other commands, key not existing is an error
456+
// For other commands, key not existing is an error - update status to Failed
457+
klog.Errorf("log backup %s/%s info key not found in etcd, marking as failed", ns, name)
458+
bt.statusUpdater.Update(backup, &v1alpha1.BackupCondition{
459+
Command: command,
460+
Type: v1alpha1.BackupFailed,
461+
Status: corev1.ConditionTrue,
462+
Reason: "LogBackupTaskNotFound",
463+
Message: "Log backup task not found in TiKV, the task may have been deleted or cluster state is inconsistent",
464+
}, nil)
457465
return false, fmt.Errorf("log backup key not found")
458466
}
459467

tests/e2e/br/br.go

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -770,6 +770,120 @@ var _ = ginkgo.Describe("Backup and Restore", func() {
770770
// k8se2e.ExpectNoError(err)
771771
// k8se2e.ExpectEqual(cleaned, true, "storage should be cleaned")
772772
// })
773+
774+
ginkgo.It("log backup task deleted externally should fail with LogBackupTaskNotFound", func() {
775+
backupClusterName := "log-backup-not-found"
776+
backupVersion := utilimage.TiDBLatest
777+
enableTLS := false
778+
skipCA := false
779+
backupName := backupClusterName
780+
typ := strings.ToLower(typeBR)
781+
782+
ns := f.Namespace.Name
783+
ctx, cancel := context.WithCancel(context.Background())
784+
defer cancel()
785+
786+
ginkgo.By("Create log-backup.enable TiDB cluster")
787+
err := createLogBackupEnableTidbCluster(f, backupClusterName, backupVersion, enableTLS, skipCA)
788+
framework.ExpectNoError(err)
789+
790+
ginkgo.By("Wait for backup TiDB cluster ready")
791+
err = utiltidbcluster.WaitForTCConditionReady(f.ExtClient, ns, backupClusterName, tidbReadyTimeout, 0)
792+
framework.ExpectNoError(err)
793+
794+
ginkgo.By("Create RBAC for log backup")
795+
err = createRBAC(f)
796+
framework.ExpectNoError(err)
797+
798+
ginkgo.By("Start log backup")
799+
backup, err := createBackupAndWaitForComplete(f, backupName, backupClusterName, typ, func(backup *v1alpha1.Backup) {
800+
backup.Spec.CleanPolicy = v1alpha1.CleanPolicyTypeDelete
801+
backup.Spec.Mode = v1alpha1.BackupModeLog
802+
})
803+
framework.ExpectNoError(err)
804+
framework.ExpectNotEqual(backup.Status.CommitTs, "")
805+
framework.ExpectEqual(backup.Status.Phase, v1alpha1.BackupRunning)
806+
807+
ginkgo.By("Stop log backup task via BR command (bypass operator)")
808+
err = stopLogBackupTaskViaBR(f, backupClusterName, backupName)
809+
framework.ExpectNoError(err)
810+
811+
ginkgo.By("Wait for backup to fail with LogBackupTaskNotFound")
812+
err = brutil.WaitForLogBackupFailedWithReason(f.ExtClient, ns, backupName, "LogBackupTaskNotFound", backupCompleteTimeout)
813+
framework.ExpectNoError(err)
814+
815+
ginkgo.By("Verify backup can be deleted")
816+
err = deleteBackup(f, backupName)
817+
framework.ExpectNoError(err)
818+
819+
ginkgo.By("Check if all backup files in storage is deleted")
820+
cleaned, err := f.Storage.IsDataCleaned(ctx, ns, backup.Spec.S3.Prefix)
821+
framework.ExpectNoError(err)
822+
framework.ExpectEqual(cleaned, true, "storage should be cleaned")
823+
})
824+
825+
ginkgo.It("log backup should wait and start when cluster becomes ready", func() {
826+
backupClusterName := "log-backup-wait-cluster"
827+
backupVersion := utilimage.TiDBLatest
828+
enableTLS := false
829+
skipCA := false
830+
backupName := backupClusterName
831+
typ := strings.ToLower(typeBR)
832+
833+
ns := f.Namespace.Name
834+
ctx, cancel := context.WithCancel(context.Background())
835+
defer cancel()
836+
837+
ginkgo.By("Create RBAC for log backup first")
838+
err := createRBAC(f)
839+
framework.ExpectNoError(err)
840+
841+
ginkgo.By("Create log backup CR before cluster exists")
842+
backupFolder := time.Now().Format(time.RFC3339)
843+
cfg := f.Storage.Config(ns, backupFolder)
844+
s := brutil.GetSecret(ns, backupName, "")
845+
_, err = f.ClientSet.CoreV1().Secrets(ns).Create(context.TODO(), s, metav1.CreateOptions{})
846+
framework.ExpectNoError(err)
847+
848+
backup := brutil.GetBackup(ns, backupName, backupClusterName, typ, cfg)
849+
backup.Spec.CleanPolicy = v1alpha1.CleanPolicyTypeDelete
850+
backup.Spec.Mode = v1alpha1.BackupModeLog
851+
_, err = f.ExtClient.PingcapV1alpha1().Backups(ns).Create(context.TODO(), backup, metav1.CreateOptions{})
852+
framework.ExpectNoError(err)
853+
854+
ginkgo.By("Verify backup is waiting (cluster not found)")
855+
// Wait a few seconds to let controller attempt reconcile
856+
time.Sleep(10 * time.Second)
857+
_, err = f.ExtClient.PingcapV1alpha1().Backups(ns).Get(context.TODO(), backupName, metav1.GetOptions{})
858+
framework.ExpectNoError(err)
859+
860+
ginkgo.By("Create log-backup.enable TiDB cluster")
861+
err = createLogBackupEnableTidbCluster(f, backupClusterName, backupVersion, enableTLS, skipCA)
862+
framework.ExpectNoError(err)
863+
864+
ginkgo.By("Wait for TiDB cluster ready")
865+
err = utiltidbcluster.WaitForTCConditionReady(f.ExtClient, ns, backupClusterName, tidbReadyTimeout, 0)
866+
framework.ExpectNoError(err)
867+
868+
ginkgo.By("Wait for backup to become Running")
869+
err = brutil.WaitForBackupComplete(f.ExtClient, ns, backupName, backupCompleteTimeout)
870+
framework.ExpectNoError(err)
871+
872+
ginkgo.By("Verify CommitTs is set")
873+
backup, err = f.ExtClient.PingcapV1alpha1().Backups(ns).Get(context.TODO(), backupName, metav1.GetOptions{})
874+
framework.ExpectNoError(err)
875+
framework.ExpectNotEqual(backup.Status.CommitTs, "", "CommitTs should be set")
876+
framework.ExpectEqual(backup.Status.Phase, v1alpha1.BackupRunning)
877+
878+
ginkgo.By("Delete backup")
879+
err = deleteBackup(f, backupName)
880+
framework.ExpectNoError(err)
881+
882+
ginkgo.By("Check if all backup files in storage is deleted")
883+
cleaned, err := f.Storage.IsDataCleaned(ctx, ns, backup.Spec.S3.Prefix)
884+
framework.ExpectNoError(err)
885+
framework.ExpectEqual(cleaned, true, "storage should be cleaned")
886+
})
773887
})
774888

775889
// the following cases may encounter errors after restarting the backup pod:
@@ -1836,3 +1950,57 @@ func createCompactBackupAndWaitForComplete(f *e2eframework.Framework, name, tcNa
18361950
}
18371951
return f.ExtClient.PingcapV1alpha1().CompactBackups(ns).Get(context.TODO(), name, metav1.GetOptions{})
18381952
}
1953+
1954+
// stopLogBackupTaskViaBR stops a log backup task directly via BR command, bypassing the operator.
1955+
// This is used to simulate external deletion of the log backup task in TiKV.
1956+
func stopLogBackupTaskViaBR(f *e2eframework.Framework, clusterName, taskName string) error {
1957+
ns := f.Namespace.Name
1958+
pdAddr := fmt.Sprintf("%s-pd:2379", clusterName)
1959+
1960+
pod := &v1.Pod{
1961+
ObjectMeta: metav1.ObjectMeta{
1962+
Name: "br-cli-" + taskName,
1963+
Namespace: ns,
1964+
},
1965+
Spec: v1.PodSpec{
1966+
RestartPolicy: v1.RestartPolicyNever,
1967+
Containers: []v1.Container{
1968+
{
1969+
Name: "br",
1970+
Image: fmt.Sprintf("pingcap/br:%s", utilimage.TiDBLatest),
1971+
Command: []string{"/br", "log", "stop", "--task-name=" + taskName, "--pd=" + pdAddr},
1972+
},
1973+
},
1974+
},
1975+
}
1976+
1977+
// Create the pod
1978+
_, err := f.ClientSet.CoreV1().Pods(ns).Create(context.TODO(), pod, metav1.CreateOptions{})
1979+
if err != nil {
1980+
return fmt.Errorf("failed to create BR CLI pod: %v", err)
1981+
}
1982+
1983+
// Wait for pod to complete
1984+
err = wait.PollImmediate(time.Second*2, time.Minute*5, func() (bool, error) {
1985+
p, err := f.ClientSet.CoreV1().Pods(ns).Get(context.TODO(), pod.Name, metav1.GetOptions{})
1986+
if err != nil {
1987+
return false, err
1988+
}
1989+
if p.Status.Phase == v1.PodSucceeded {
1990+
return true, nil
1991+
}
1992+
if p.Status.Phase == v1.PodFailed {
1993+
return false, fmt.Errorf("BR CLI pod failed")
1994+
}
1995+
return false, nil
1996+
})
1997+
1998+
// Clean up pod regardless of success/failure
1999+
deleteErr := f.ClientSet.CoreV1().Pods(ns).Delete(context.TODO(), pod.Name, metav1.DeleteOptions{})
2000+
if deleteErr != nil {
2001+
// Log but don't fail on cleanup error
2002+
fmt.Printf("Warning: failed to delete BR CLI pod: %v\n", deleteErr)
2003+
}
2004+
2005+
return err
2006+
}

tests/e2e/br/framework/br/wait.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,40 @@ func WaitForBackupFailed(c versioned.Interface, ns, name string, timeout time.Du
181181
return nil
182182
}
183183

184+
// WaitForLogBackupFailedWithReason will poll and wait until timeout or log backup fails with expected reason
185+
func WaitForLogBackupFailedWithReason(c versioned.Interface, ns, name, expectedReason string, timeout time.Duration) error {
186+
if err := wait.PollImmediate(poll, timeout, func() (bool, error) {
187+
b, err := c.PingcapV1alpha1().Backups(ns).Get(context.TODO(), name, metav1.GetOptions{})
188+
if err != nil {
189+
return false, err
190+
}
191+
192+
// Check log backup subcommand status for failure
193+
if b.Spec.Mode == v1alpha1.BackupModeLog {
194+
if v1alpha1.IsLogBackupSubCommandOntheCondition(b, v1alpha1.BackupFailed) {
195+
reason, _ := v1alpha1.GetLogSubcommandConditionInfo(b)
196+
if reason == expectedReason {
197+
return true, nil
198+
}
199+
return false, fmt.Errorf("log backup failed with unexpected reason: %s, expected: %s", reason, expectedReason)
200+
}
201+
}
202+
203+
// Also check top-level conditions
204+
for _, cond := range b.Status.Conditions {
205+
if cond.Type == v1alpha1.BackupFailed && cond.Status == corev1.ConditionTrue {
206+
if cond.Reason == expectedReason {
207+
return true, nil
208+
}
209+
}
210+
}
211+
return false, nil
212+
}); err != nil {
213+
return fmt.Errorf("can't wait for log backup failed with reason %s: %v", expectedReason, err)
214+
}
215+
return nil
216+
}
217+
184218
// WaitForRestoreComplete will poll and wait until timeout or restore complete condition is true
185219
func WaitForRestoreComplete(c versioned.Interface, ns, name string, timeout time.Duration) error {
186220
if err := wait.PollImmediate(poll, timeout, func() (bool, error) {

0 commit comments

Comments
 (0)