@@ -78,6 +78,14 @@ const (
7878 // instanceReplicationInfoTypePrimaryStandby is the label used by Patroni to indicate that an
7979 // instance is indeed a primary PostgreSQL instance, specifically within a standby cluster
8080 instanceReplicationInfoTypePrimaryStandby = "Standby Leader"
81+ // instanceRolePrimary indicates that an instance is a primary
82+ instanceRolePrimary = "primary"
83+ // instanceRoleReplica indicates that an instance is a replica
84+ instanceRoleReplica = "replica"
85+ // instanceRoleUnknown indicates taht an instance is of an unknown typ
86+ instanceRoleUnknown = "unknown"
87+ // instanceStatusUnavailable indicates an instance is unavailable
88+ instanceStatusUnavailable = "unavailable"
8189)
8290
8391var (
@@ -136,20 +144,34 @@ func GetPod(clientset *kubernetes.Clientset, deploymentName, namespace string) (
136144// By default information is only returned for replicas within the cluster. However,
137145// if primary information is also needed, the inlcudePrimary flag can set set to true
138146// and primary information will will also be included in the ReplicationStatusResponse.
139- func ReplicationStatus (request ReplicationStatusRequest , includePrimary bool ) (ReplicationStatusResponse , error ) {
147+ //
148+ // Also by default we do not include any "busted" Pods, e.g. a Pod that is not
149+ // in a happy phase. That Pod may be lacking a "role" label. From there, we zero
150+ // out the statistics and apply an error
151+ func ReplicationStatus (request ReplicationStatusRequest , includePrimary , includeBusted bool ) (ReplicationStatusResponse , error ) {
140152 response := ReplicationStatusResponse {
141153 Instances : make ([]InstanceReplicationInfo , 0 ),
142154 }
143155
144- // First, get replica pods using selector pg-cluster=clusterName,role=replica if not including the primary,
145- // or pg-cluster=clusterName,pg-database if including the primary
146- var roleSelector string
156+ // Build up the selector. First, create the base, which restricts to the
157+ // current cluster
158+ // pg-cluster=clusterName,pgo-pg-database
159+ selector := fmt .Sprintf ("%s=%s,%s" ,
160+ config .LABEL_PG_CLUSTER , request .ClusterName , config .LABEL_PG_DATABASE )
161+
162+ // if we are not including the primary, determine if we are including busted
163+ // replicas or not
147164 if ! includePrimary {
148- roleSelector = fmt .Sprintf ("%s=%s" , config .LABEL_PGHA_ROLE , config .LABEL_PGHA_ROLE_REPLICA )
149- } else {
150- roleSelector = config .LABEL_PG_DATABASE
165+ if includeBusted {
166+ // include all Pods that identify as a database, but **not** a primary
167+ // pg-cluster=clusterName,pgo-pg-database,role!=config.LABEL_PGHA_ROLE_PRIMARY
168+ selector += fmt .Sprintf (",%s!=%s" , config .LABEL_PGHA_ROLE , config .LABEL_PGHA_ROLE_PRIMARY )
169+ } else {
170+ // include all Pods that identify as a database and have a replica label
171+ // pg-cluster=clusterName,pgo-pg-database,role=replica
172+ selector += fmt .Sprintf (",%s=%s" , config .LABEL_PGHA_ROLE , config .LABEL_PGHA_ROLE_REPLICA )
173+ }
151174 }
152- selector := fmt .Sprintf ("%s=%s,%s" , config .LABEL_PG_CLUSTER , request .ClusterName , roleSelector )
153175
154176 log .Debugf (`searching for pods with "%s"` , selector )
155177 pods , err := kubeapi .GetPods (request .Clientset , selector , request .Namespace )
@@ -175,8 +197,36 @@ func ReplicationStatus(request ReplicationStatusRequest, includePrimary bool) (R
175197 // Now get the statistics about the current state of the replicas, which we
176198 // can delegate to Patroni vis-a-vis the information that it collects
177199 // We can get the statistics about the current state of the managed instance
178- // From executing and running a command in the first pod
179- pod := pods .Items [0 ]
200+ // From executing and running a command in the first active pod
201+ var pod * v1.Pod
202+
203+ for _ , p := range pods .Items {
204+ if p .Status .Phase == v1 .PodRunning {
205+ pod = & p
206+ break
207+ }
208+ }
209+
210+ // if no active Pod can be found, we can only assume that all of the instances
211+ // are unavailable, and we should indicate as such
212+ if pod == nil {
213+ for _ , p := range pods .Items {
214+ // set up the instance that will be returned
215+ instance := InstanceReplicationInfo {
216+ Name : instanceInfoMap [p .Name ].name ,
217+ Node : instanceInfoMap [p .Name ].node ,
218+ ReplicationLag : - 1 ,
219+ Role : instanceRoleUnknown ,
220+ Status : instanceStatusUnavailable ,
221+ Timeline : - 1 ,
222+ }
223+
224+ // append this newly created instance to the list that will be returned
225+ response .Instances = append (response .Instances , instance )
226+ }
227+
228+ return response , nil
229+ }
180230
181231 // Execute the command that will retrieve the replica information from Patroni
182232 commandStdOut , _ , err := kubeapi .ExecToPodThroughAPI (
@@ -197,17 +247,25 @@ func ReplicationStatus(request ReplicationStatusRequest, includePrimary bool) (R
197247 // We need to iterate through this list to format the information for the
198248 // response
199249 for _ , rawInstance := range rawInstances {
200-
201250 var role string
251+
202252 // skip the primary unless explicitly enabled
203- if rawInstance .Type == instanceReplicationInfoTypePrimary ||
204- rawInstance .Type == instanceReplicationInfoTypePrimaryStandby {
205- if ! includePrimary {
206- continue
207- }
208- role = "primary"
209- } else {
210- role = "replica"
253+ if ! includePrimary && (rawInstance .Type == instanceReplicationInfoTypePrimary ||
254+ rawInstance .Type == instanceReplicationInfoTypePrimaryStandby ) {
255+ continue
256+ }
257+
258+ // if this is a busted instance and we are not including it, skip
259+ if ! includeBusted && rawInstance .State == "" {
260+ continue
261+ }
262+
263+ // determine the role of the instnace
264+ switch rawInstance .Type {
265+ default :
266+ role = instanceRoleReplica
267+ case instanceReplicationInfoTypePrimary , instanceReplicationInfoTypePrimaryStandby :
268+ role = instanceRolePrimary
211269 }
212270
213271 // set up the instance that will be returned
@@ -218,11 +276,14 @@ func ReplicationStatus(request ReplicationStatusRequest, includePrimary bool) (R
218276 Role : role ,
219277 Name : instanceInfoMap [rawInstance .PodName ].name ,
220278 Node : instanceInfoMap [rawInstance .PodName ].node ,
279+ PendingRestart : rawInstance .PendingRestart == "*" ,
221280 }
222281
223- // indicate whether or not the instance has a pending restart
224- if rawInstance .PendingRestart == "*" {
225- instance .PendingRestart = true
282+ // update the instance info if the instance is busted
283+ if rawInstance .State == "" {
284+ instance .Status = instanceStatusUnavailable
285+ instance .ReplicationLag = - 1
286+ instance .Timeline = - 1
226287 }
227288
228289 // append this newly created instance to the list that will be returned
0 commit comments