4 changes: 2 additions & 2 deletions .gitignore
@@ -11,7 +11,7 @@
# Output of the go coverage tool, specifically when used with LiteIDE
*.out
cluster-etcd-operator
tnf-setup-runner
tnf-runtime

# Log output from telepresence
telepresence.log
@@ -27,4 +27,4 @@ report.json
test-results/

# OpenShift Tests Extension metadata
.openshift-tests-extension/openshift_payload_*.json
.openshift-tests-extension/openshift_payload_*.json
2 changes: 1 addition & 1 deletion Dockerfile.ocp
@@ -10,7 +10,7 @@ FROM registry.ci.openshift.org/ocp/4.21:base-rhel9
COPY --from=builder /go/src/github.com/openshift/cluster-etcd-operator/bindata/bootkube/bootstrap-manifests /usr/share/bootkube/manifests/bootstrap-manifests/
COPY --from=builder /go/src/github.com/openshift/cluster-etcd-operator/bindata/bootkube/manifests /usr/share/bootkube/manifests/manifests/
COPY --from=builder /go/src/github.com/openshift/cluster-etcd-operator/cluster-etcd-operator /usr/bin/
COPY --from=builder /go/src/github.com/openshift/cluster-etcd-operator/tnf-setup-runner /usr/bin/
COPY --from=builder /go/src/github.com/openshift/cluster-etcd-operator/tnf-runtime /usr/bin/
COPY --from=builder /go/src/github.com/openshift/cluster-etcd-operator/cluster-etcd-operator-tests-ext.gz /usr/bin/
COPY manifests/ /manifests

19 changes: 19 additions & 0 deletions bindata/tnfdeployment/clusterrole.yaml
@@ -29,6 +29,25 @@ rules:
- get
- patch
- update
- apiGroups:
- etcd.openshift.io
resources:
- pacemakerstatuses
verbs:
- get
- list
- watch
- create
- update
- patch
- apiGroups:
- etcd.openshift.io
resources:
- pacemakerstatuses/status
verbs:
- get
- update
- patch
- apiGroups:
- config.openshift.io
resources:
48 changes: 48 additions & 0 deletions bindata/tnfdeployment/cronjob.yaml
@@ -0,0 +1,48 @@
apiVersion: batch/v1
kind: CronJob
metadata:
name: <injected>
namespace: <injected>
labels:
app.kubernetes.io/name: <injected>
spec:
schedule: <injected>
concurrencyPolicy: "Forbid"
failedJobsHistoryLimit: 3
successfulJobsHistoryLimit: 1
jobTemplate:
metadata:
labels:
app.kubernetes.io/name: <injected>
spec:
ttlSecondsAfterFinished: 600
template:
metadata:
annotations:
openshift.io/required-scc: "privileged"
labels:
app.kubernetes.io/name: <injected>
spec:
containers:
- name: collector
image: <injected>
imagePullPolicy: IfNotPresent
terminationMessagePolicy: FallbackToLogsOnError
command: ["<injected>"]
resources:
requests:
cpu: 10m
memory: 32Mi
securityContext:
privileged: true
allowPrivilegeEscalation: true
hostIPC: false
hostNetwork: false
hostPID: true
priorityClassName: system-node-critical
serviceAccountName: tnf-setup-manager
nodeSelector:
node-role.kubernetes.io/master: ""
tolerations:
- operator: "Exists"
restartPolicy: OnFailure
2 changes: 1 addition & 1 deletion bindata/tnfdeployment/job.yaml
@@ -15,7 +15,7 @@ spec:
- name: tnf-job
image: <injected>
imagePullPolicy: IfNotPresent
command: [ "tnf-setup-runner", "<injected>" ]
command: [ "tnf-runtime", "<injected>" ]
terminationMessagePolicy: FallbackToLogsOnError
resources:
requests:
File renamed without changes.
22 changes: 18 additions & 4 deletions cmd/tnf-setup-runner/main.go → cmd/tnf-runtime/main.go
@@ -18,6 +18,7 @@ import (
tnfaftersetup "github.com/openshift/cluster-etcd-operator/pkg/tnf/after-setup"
tnfauth "github.com/openshift/cluster-etcd-operator/pkg/tnf/auth"
tnffencing "github.com/openshift/cluster-etcd-operator/pkg/tnf/fencing"
"github.com/openshift/cluster-etcd-operator/pkg/tnf/pkg/pacemaker"
"github.com/openshift/cluster-etcd-operator/pkg/tnf/pkg/tools"
tnfsetup "github.com/openshift/cluster-etcd-operator/pkg/tnf/setup"
)
@@ -37,17 +38,18 @@ func main() {
logs.InitLogs()
defer logs.FlushLogs()

command := NewTnfSetupRunnerCommand()
command := NewTnfRuntimeCommand()
if err := command.Execute(); err != nil {
fmt.Fprintf(os.Stderr, "%v\n", err)
os.Exit(1)
}
}

func NewTnfSetupRunnerCommand() *cobra.Command {
func NewTnfRuntimeCommand() *cobra.Command {
cmd := &cobra.Command{
Use: "tnf-setup-runner",
Short: "OpenShift Two Node Fencing Setup runner",
Use: "tnf-runtime",
Short: "OpenShift Two Node Fencing Runtime",
Long: "Runtime commands for Two Node Fencing setup and monitoring operations",
Run: func(cmd *cobra.Command, args []string) {
cmd.Help()
os.Exit(1)
@@ -58,6 +60,7 @@ func NewTnfSetupRunnerCommand() *cobra.Command {
cmd.AddCommand(NewSetupCommand())
cmd.AddCommand(NewAfterSetupCommand())
cmd.AddCommand(NewFencingCommand())
cmd.AddCommand(NewMonitorCommand())

return cmd
}
@@ -113,3 +116,14 @@ func NewFencingCommand() *cobra.Command {
},
}
}

func NewMonitorCommand() *cobra.Command {
// Reuse the existing collector command as the "monitor" subcommand.
cmd := pacemaker.NewPacemakerStatusCollectorCommand()
cmd.Use = "monitor"
cmd.Short = "Monitor and collect pacemaker cluster status"
cmd.Long = "Collects pacemaker status and updates PacemakerStatus CR"
cmd.SilenceUsage = true
cmd.SilenceErrors = true
return cmd
}
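
As a quick sanity check on the command wiring above, a unit test along the following lines would confirm that the new monitor subcommand is registered on the root tnf-runtime command. This is a minimal sketch, assuming the package compiles as shown in this diff; the test itself is not part of the change.

package main

import "testing"

// Minimal sketch: verify that "monitor" is registered on the tnf-runtime root command.
func TestMonitorSubcommandRegistered(t *testing.T) {
	root := NewTnfRuntimeCommand()
	cmd, _, err := root.Find([]string{"monitor"})
	if err != nil {
		t.Fatalf("monitor subcommand not found: %v", err)
	}
	if cmd.Use != "monitor" {
		t.Errorf("expected Use %q, got %q", "monitor", cmd.Use)
	}
}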
177 changes: 177 additions & 0 deletions manifests/0000_25_etcd-operator_01_pacemakerstatus.crd.yaml
@@ -0,0 +1,177 @@
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
name: pacemakerstatuses.etcd.openshift.io
annotations:
include.release.openshift.io/self-managed-high-availability: "true"
spec:
group: etcd.openshift.io
names:
kind: PacemakerStatus
listKind: PacemakerStatusList
plural: pacemakerstatuses
singular: pacemakerstatus
scope: Cluster
versions:
- name: v1alpha1
served: true
storage: true
schema:
openAPIV3Schema:
description: PacemakerStatus represents the current state of the Pacemaker cluster as reported by the pcs status command.
type: object
properties:
apiVersion:
description: 'APIVersion defines the versioned schema of this representation of an object.'
type: string
kind:
description: 'Kind is a string value representing the REST resource this object represents.'
type: string
metadata:
type: object
spec:
description: PacemakerStatusSpec defines the desired state of PacemakerStatus
type: object
properties:
nodeName:
description: NodeName identifies which node this status is for
type: string
status:
description: PacemakerStatusStatus contains the actual pacemaker cluster status information
type: object
properties:
lastUpdated:
description: LastUpdated is the timestamp when this status was last updated
type: string
format: date-time
rawXML:
description: RawXML contains the raw XML output from pcs status xml command
type: string
maxLength: 262144
collectionError:
description: CollectionError contains any error encountered while collecting status
type: string
summary:
description: Summary provides high-level counts and flags for the cluster state
type: object
properties:
pacemakerdState:
description: PacemakerdState indicates if pacemaker is running
type: string
hasQuorum:
description: HasQuorum indicates if the cluster has quorum
type: boolean
nodesOnline:
description: NodesOnline is the count of online nodes
type: integer
nodesTotal:
description: NodesTotal is the total count of configured nodes
type: integer
resourcesStarted:
description: ResourcesStarted is the count of started resources
type: integer
resourcesTotal:
description: ResourcesTotal is the total count of configured resources
type: integer
recentFailures:
description: RecentFailures indicates if there are recent operation failures
type: boolean
recentFencing:
description: RecentFencing indicates if there are recent fencing events
type: boolean
nodes:
description: Nodes provides detailed information about each node in the cluster
type: array
items:
type: object
required:
- name
- online
properties:
name:
description: Name is the name of the node
type: string
online:
description: Online indicates if the node is online
type: boolean
standby:
description: Standby indicates if the node is in standby mode
type: boolean
resources:
description: Resources provides detailed information about each resource in the cluster
type: array
items:
type: object
required:
- name
properties:
name:
description: Name is the name of the resource
type: string
resourceAgent:
description: ResourceAgent is the resource agent type
type: string
role:
description: Role is the current role of the resource
type: string
active:
description: Active indicates if the resource is active
type: boolean
node:
description: Node is the node where the resource is running
type: string
nodeHistory:
description: NodeHistory provides recent operation history for troubleshooting
type: array
items:
type: object
required:
- node
- resource
- operation
- rc
properties:
node:
description: Node is the node where the operation occurred
type: string
resource:
description: Resource is the resource that was operated on
type: string
operation:
description: Operation is the operation that was performed
type: string
rc:
description: RC is the return code from the operation
type: integer
rcText:
description: RCText is the human-readable return code text
type: string
lastRCChange:
description: LastRCChange is the timestamp when the RC last changed
type: string
format: date-time
fencingHistory:
description: FencingHistory provides recent fencing events
type: array
items:
type: object
required:
- target
- action
- status
properties:
target:
description: Target is the node that was fenced
type: string
action:
description: Action is the fencing action performed
type: string
status:
description: Status is the status of the fencing operation
type: string
completed:
description: Completed is the timestamp when the fencing completed
type: string
format: date-time
subresources:
status: {}
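
Since this diff does not add a generated typed client for the new CRD, a status collector could read and update PacemakerStatus objects through the dynamic client. The sketch below is illustrative only — the helper name, package placement, and the field it sets are assumptions, not code from this PR — and it relies on the status subresource enabled by `subresources.status` above.

package main

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/client-go/dynamic"
	"k8s.io/client-go/rest"
)

// pacemakerStatusGVR matches the CRD above: group etcd.openshift.io,
// version v1alpha1, resource pacemakerstatuses (cluster-scoped).
var pacemakerStatusGVR = schema.GroupVersionResource{
	Group:    "etcd.openshift.io",
	Version:  "v1alpha1",
	Resource: "pacemakerstatuses",
}

// updateSummary is a hypothetical helper (not from this PR): it fetches a
// PacemakerStatus object by name and writes .status.summary.hasQuorum via
// the status subresource.
func updateSummary(ctx context.Context, cfg *rest.Config, name string, hasQuorum bool) error {
	client, err := dynamic.NewForConfig(cfg)
	if err != nil {
		return err
	}
	obj, err := client.Resource(pacemakerStatusGVR).Get(ctx, name, metav1.GetOptions{})
	if err != nil {
		return err
	}
	if err := unstructured.SetNestedField(obj.Object, hasQuorum, "status", "summary", "hasQuorum"); err != nil {
		return err
	}
	if _, err := client.Resource(pacemakerStatusGVR).UpdateStatus(ctx, obj, metav1.UpdateOptions{}); err != nil {
		return fmt.Errorf("updating PacemakerStatus %s: %w", name, err)
	}
	return nil
}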
41 changes: 41 additions & 0 deletions pkg/operator/ceohelpers/external_etcd_status.go
@@ -2,9 +2,14 @@ package ceohelpers

import (
"context"
"fmt"
"time"

configv1listers "github.com/openshift/client-go/config/listers/config/v1"
operatorv1informers "github.com/openshift/client-go/operator/informers/externalversions/operator/v1"
"github.com/openshift/library-go/pkg/operator/v1helpers"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/tools/cache"
"k8s.io/klog/v2"

"github.com/openshift/cluster-etcd-operator/pkg/tnf/pkg/etcd"
@@ -158,3 +163,39 @@ func GetExternalEtcdClusterStatus(ctx context.Context,

return externalEtcdStatus, nil
}

// WaitForEtcdCondition is a generic helper that waits for an etcd-related condition to become true.
// It first syncs the etcd informer cache, then polls the condition function until it returns true
// or the timeout is reached.
func WaitForEtcdCondition(
ctx context.Context,
etcdInformer operatorv1informers.EtcdInformer,
operatorClient v1helpers.StaticPodOperatorClient,
conditionCheck func(context.Context, v1helpers.StaticPodOperatorClient) (bool, error),
pollInterval time.Duration,
timeout time.Duration,
conditionName string,
) error {
// Wait for the etcd informer to sync before checking the condition.
// This ensures operatorClient.GetStaticPodOperatorState() has data to work with.
klog.Infof("waiting for etcd informer to sync before checking %s...", conditionName)
if !cache.WaitForCacheSync(ctx.Done(), etcdInformer.Informer().HasSynced) {
return fmt.Errorf("failed to sync etcd informer")
}
klog.Infof("etcd informer synced, checking for %s", conditionName)

// Poll until the condition is met
return wait.PollUntilContextTimeout(ctx, pollInterval, timeout, true, func(ctx context.Context) (bool, error) {
conditionMet, err := conditionCheck(ctx, operatorClient)
if err != nil {
klog.Warningf("error checking %s, will retry: %v", conditionName, err)
return false, nil
}
if conditionMet {
klog.V(2).Infof("%s condition met", conditionName)
return true, nil
}
klog.V(4).Infof("%s condition not yet met, waiting...", conditionName)
return false, nil
})
}
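
For context, a caller of WaitForEtcdCondition would look roughly like the sketch below. The condition type "ExternalEtcdHealthy", the function name, and the intervals are illustrative assumptions rather than values from this PR; the imports are those already present in this file.

// Illustrative caller; not part of this change.
func waitForExternalEtcdHealthy(ctx context.Context,
	etcdInformer operatorv1informers.EtcdInformer,
	operatorClient v1helpers.StaticPodOperatorClient) error {

	return WaitForEtcdCondition(
		ctx,
		etcdInformer,
		operatorClient,
		func(ctx context.Context, client v1helpers.StaticPodOperatorClient) (bool, error) {
			_, status, _, err := client.GetStaticPodOperatorState()
			if err != nil {
				return false, err
			}
			// "ExternalEtcdHealthy" is a hypothetical condition type used for illustration.
			return v1helpers.IsOperatorConditionTrue(status.Conditions, "ExternalEtcdHealthy"), nil
		},
		10*time.Second, // poll interval
		5*time.Minute,  // overall timeout
		"external etcd healthy",
	)
}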