
Commit 0d2defd

Merge pull request #73672 from johnwilkins/TELCODOCS-1388
TELCODOCS-1388: Add IPI troubleshooting information to our documentation

2 parents 85bd65e + 7528a12

11 files changed: +445 -32 lines

installing/installing_bare_metal_ipi/ipi-install-troubleshooting.adoc

Lines changed: 12 additions & 6 deletions
@@ -7,35 +7,41 @@ include::_attributes/common-attributes.adoc[]
 toc::[]
 
 
-== Troubleshooting the installer workflow
+== Troubleshooting the installation program workflow
 
 Prior to troubleshooting the installation environment, it is critical to understand the overall flow of the installer-provisioned installation on bare metal. The diagrams below provide a troubleshooting flow with a step-by-step breakdown for the environment.
 
 image:flow1.png[Flow-Diagram-1]
 
-_Workflow 1 of 4_ illustrates a troubleshooting workflow when the `install-config.yaml` file has errors or the {op-system-first} images are inaccessible. Troubleshooting suggestions can be found at xref:ipi-install-troubleshooting-install-config_{context}[Troubleshooting `install-config.yaml`].
+_Workflow 1 of 4_ illustrates a troubleshooting workflow when the `install-config.yaml` file has errors or the {op-system-first} images are inaccessible. Troubleshooting suggestions can be found at xref:ipi-install-troubleshooting-install-config_ipi-install-troubleshooting[Troubleshooting `install-config.yaml`].
 
 image:flow2.png[Flow-Diagram-2]
 
-_Workflow 2 of 4_ illustrates a troubleshooting workflow for xref:ipi-install-troubleshooting-bootstrap-vm_{context}[ bootstrap VM issues], xref:ipi-install-troubleshooting-bootstrap-vm-cannot-boot_{context}[ bootstrap VMs that cannot boot up the cluster nodes], and xref:ipi-install-troubleshooting-bootstrap-vm-inspecting-logs_{context}[ inspecting logs]. When installing an {product-title} cluster without the `provisioning` network, this workflow does not apply.
+_Workflow 2 of 4_ illustrates a troubleshooting workflow for xref:ipi-install-troubleshooting-bootstrap-vm_ipi-install-troubleshooting[ bootstrap VM issues], xref:ipi-install-troubleshooting-bootstrap-vm-cannot-boot_ipi-install-troubleshooting[ bootstrap VMs that cannot boot up the cluster nodes], and xref:ipi-install-troubleshooting-bootstrap-vm-inspecting-logs_ipi-install-troubleshooting[ inspecting logs]. When installing an {product-title} cluster without the `provisioning` network, this workflow does not apply.
 
 image:flow3.png[Flow-Diagram-3]
 
-_Workflow 3 of 4_ illustrates a troubleshooting workflow for xref:ipi-install-troubleshooting-cluster-nodes-will-not-pxe_{context}[ cluster nodes that will not PXE boot]. If installing using RedFish Virtual Media, each node must meet minimum firmware requirements for the installer to deploy the node. See *Firmware requirements for installing with virtual media* in the *Prerequisites* section for additional details.
+_Workflow 3 of 4_ illustrates a troubleshooting workflow for xref:ipi-install-troubleshooting-cluster-nodes-will-not-pxe_ipi-install-troubleshooting[ cluster nodes that will not PXE boot]. If installing using RedFish Virtual Media, each node must meet minimum firmware requirements for the installation program to deploy the node. See *Firmware requirements for installing with virtual media* in the *Prerequisites* section for additional details.
 
 image:flow4.png[Flow-Diagram-4]
 
 _Workflow 4 of 4_ illustrates a troubleshooting workflow from
-xref:ipi-install-troubleshooting-api-not-accessible_{context}[ a non-accessible API] to a xref:ipi-install-troubleshooting-reviewing-the-installation_{context}[validated installation].
+xref:investigating-an-unavailable-kubernetes-api_ipi-install-troubleshooting[ a non-accessible API] to a xref:ipi-install-troubleshooting-reviewing-the-installation_ipi-install-troubleshooting[validated installation].
 
 
 include::modules/ipi-install-troubleshooting-install-config.adoc[leveloffset=+1]
 include::modules/ipi-install-troubleshooting-bootstrap-vm.adoc[leveloffset=+1]
 include::modules/ipi-install-troubleshooting-bootstrap-vm-cannot-boot.adoc[leveloffset=+2]
 include::modules/ipi-install-troubleshooting-bootstrap-vm-inspecting-logs.adoc[leveloffset=+2]
+include::modules/ipi-install-troubleshooting-investigating-an-unavailable-kubernetes-api.adoc[leveloffset=+1]
+include::modules/ipi-install-troubleshooting-troubleshooting-failure-to-initialize-the-cluster.adoc[leveloffset=+1]
+include::modules/ipi-install-troubleshooting-troubleshooting-failure-to-fetch-the-console-url.adoc[leveloffset=+1]
+include::modules/ipi-install-troubleshooting-troubleshooting-failure-to-add-the-ingress-certificate-to-kubeconfig.adoc[leveloffset=+1]
+include::modules/ipi-install-troubleshooting-troubleshooting-ssh-access-to-cluster-nodes.adoc[leveloffset=+1]
 include::modules/ipi-install-troubleshooting-cluster-nodes-will-not-pxe.adoc[leveloffset=+1]
+include::modules/ipi-install-troubleshooting-installing-creates-no-worker-nodes.adoc[leveloffset=+1]
+include::modules/ipi-install-troubleshooting-troubleshooting-the-cluster-network-operator.adoc[leveloffset=+1]
 include::modules/ipi-install-troubleshooting_unable-to-discover-new-bare-metal-hosts-using-the-bmc.adoc[leveloffset=+1]
-include::modules/ipi-install-troubleshooting-api-not-accessible.adoc[leveloffset=+1]
 include::modules/ipi-install-troubleshooting_proc_worker-nodes-cannot-join-the-cluster.adoc[leveloffset=+1]
 include::modules/ipi-install-troubleshooting-cleaning-up-previous-installations.adoc[leveloffset=+1]
 include::modules/ipi-install-troubleshooting-registry-issues.adoc[leveloffset=+1]

modules/ipi-install-troubleshooting-bootstrap-vm.adoc

Lines changed: 5 additions & 12 deletions
@@ -4,7 +4,7 @@
 :_mod-docs-content-type: PROCEDURE
 [id="ipi-install-troubleshooting-bootstrap-vm_{context}"]
 
-= Bootstrap VM issues
+= Troubleshooting bootstrap VM issues
 
 The {product-title} installation program spawns a bootstrap node virtual machine, which handles provisioning the {product-title} cluster nodes.
 
@@ -28,10 +28,8 @@ $ sudo virsh list
 ====
 The name of the bootstrap VM is always the cluster name followed by a random set of characters and ending in the word "bootstrap."
 ====
-+
-If the bootstrap VM is not running after 10-15 minutes, troubleshoot why it is not running. Possible issues include:
 
-. Verify `libvirtd` is running on the system:
+. If the bootstrap VM is not running after 10-15 minutes, verify `libvirtd` is running on the system by executing the following command:
 +
 [source,terminal]
 ----
@@ -79,7 +77,6 @@ localhost login:
 When deploying an {product-title} cluster without the `provisioning` network, you must use a public IP address and not a private IP address like `172.22.0.2`.
 ====
 
-
 . After you obtain the IP address, log in to the bootstrap VM using the `ssh` command:
 +
 [NOTE]
@@ -95,12 +92,8 @@ $ ssh [email protected]
 If you are not successful logging in to the bootstrap VM, you have likely encountered one of the following scenarios:
 
 * You cannot reach the `172.22.0.0/24` network. Verify the network connectivity between the provisioner and the `provisioning` network bridge. This issue might occur if you are using a `provisioning` network.
-`
-* You cannot reach the bootstrap VM through the public network. When attempting
-to SSH via `baremetal` network, verify connectivity on the
+
+* You cannot reach the bootstrap VM through the public network. When attempting to SSH via `baremetal` network, verify connectivity on the
 `provisioner` host specifically around the `baremetal` network bridge.
 
-* You encountered `Permission denied (publickey,password,keyboard-interactive)`. When
-attempting to access the bootstrap VM, a `Permission denied` error
-might occur. Verify that the SSH key for the user attempting to log
-in to the VM is set within the `install-config.yaml` file.
+* You encountered `Permission denied (publickey,password,keyboard-interactive)`. When attempting to access the bootstrap VM, a `Permission denied` error might occur. Verify that the SSH key for the user attempting to log in to the VM is set within the `install-config.yaml` file.
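The bootstrap VM naming rule called out above (cluster name, random suffix, ending in "bootstrap") lends itself to scripting. A minimal sketch, run against invented `virsh list` output rather than a live hypervisor; the cluster name `mycluster-xf6fq` is a placeholder:

```shell
# Sketch only: locate the bootstrap VM in `sudo virsh list` output.
# The sample output is invented; the naming rule (the VM name ends in
# "bootstrap") comes from the module above.
virsh_output=' Id   Name                        State
----------------------------------------------
 1    mycluster-xf6fq-bootstrap   running'

# Print the Name column of any row whose name ends in "bootstrap".
bootstrap_vm=$(printf '%s\n' "$virsh_output" | awk '$2 ~ /bootstrap$/ {print $2}')
echo "$bootstrap_vm"
```

On a real provisioner host, the `virsh_output` variable would instead be populated by `sudo virsh list`.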
modules/ipi-install-troubleshooting-installing-creates-no-worker-nodes.adoc (new file)

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+// This module is included in the following assemblies:
+//
+// installing/installing_bare_metal_ipi/ipi-install-troubleshooting.adoc
+
+:_mod-docs-content-type: PROCEDURE
+[id="installing-creates-no-worker-nodes_{context}"]
+= Installing creates no worker nodes
+
+The installation program does not provision worker nodes directly. Instead, the Machine API Operator scales nodes up and down on supported platforms. If worker nodes are not created after 15 to 20 minutes, depending on the speed of the cluster's internet connection, investigate the Machine API Operator.
+
+.Procedure
+
+. Check the Machine API Operator by running the following command:
++
+[source,terminal]
+----
+$ oc --kubeconfig=${INSTALL_DIR}/auth/kubeconfig \
+--namespace=openshift-machine-api get deployments
+----
++
+If `${INSTALL_DIR}` is not set in your environment, replace the value with the name of the installation directory.
++
+.Example output
+[source,terminal]
+----
+NAME                          READY   UP-TO-DATE   AVAILABLE   AGE
+cluster-autoscaler-operator   1/1     1            1           86m
+cluster-baremetal-operator    1/1     1            1           86m
+machine-api-controllers       1/1     1            1           85m
+machine-api-operator          1/1     1            1           86m
+----
+
+. Check the machine controller logs by running the following command:
++
+[source,terminal]
+----
+$ oc --kubeconfig=${INSTALL_DIR}/auth/kubeconfig \
+--namespace=openshift-machine-api logs deployments/machine-api-controllers \
+--container=machine-controller
+----
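When the `oc get deployments` check in the new module turns up trouble, it usually shows in the READY column. A hypothetical sketch of that comparison, run against canned output modeled on the module's example (with one row deliberately altered to `0/1` so the check has something to flag):

```shell
# Sketch only: flag deployments whose READY column (ready/desired) is not full.
# The machine-api-controllers row is intentionally altered to demonstrate a hit.
deployments='NAME                          READY   UP-TO-DATE   AVAILABLE   AGE
cluster-autoscaler-operator   1/1     1            1           86m
cluster-baremetal-operator    1/1     1            1           86m
machine-api-controllers       0/1     1            0           85m
machine-api-operator          1/1     1            1           86m'

# Split the READY column on "/" and report rows where ready != desired.
not_ready=$(printf '%s\n' "$deployments" | awk 'NR > 1 { split($2, r, "/"); if (r[1] != r[2]) print $1 }')
echo "$not_ready"
```

Against a live cluster, the same awk filter could be piped directly from the `oc get deployments` command shown in the module.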

modules/ipi-install-troubleshooting-api-not-accessible.adoc renamed to modules/ipi-install-troubleshooting-investigating-an-unavailable-kubernetes-api.adoc

Lines changed: 25 additions & 9 deletions
@@ -1,16 +1,32 @@
-// Module included in the following assemblies:
-// //installing/installing_bare_metal_ipi/installing_bare_metal_ipi/ipi-install-troubleshooting.adoc
+// This module is included in the following assemblies:
+//
+// installing/installing_bare_metal_ipi/ipi-install-troubleshooting.adoc
 
 :_mod-docs-content-type: PROCEDURE
-[id="ipi-install-troubleshooting-api-not-accessible_{context}"]
+[id="investigating-an-unavailable-kubernetes-api_{context}"]
+= Investigating an unavailable Kubernetes API
 
-= The API is not accessible
-
-When the cluster is running and clients cannot access the API, domain name resolution issues might impede access to the API.
+When the Kubernetes API is unavailable, check the control plane nodes to ensure that they are running the correct components. Also, check the hostname resolution.
 
 .Procedure
 
-. **Hostname Resolution:** Check the cluster nodes to ensure they have a fully qualified domain name, and not just `localhost.localdomain`. For example:
+. Ensure that `etcd` is running on each of the control plane nodes by running the following command:
++
+[source,terminal]
+----
+$ sudo crictl logs $(sudo crictl ps --pod=$(sudo crictl pods --name=etcd-member --quiet) --quiet)
+----
+
+. If the previous command fails, ensure that the kubelet created the `etcd` pods by running the following command:
++
+[source,terminal]
+----
+$ sudo crictl pods --name=etcd-member
+----
++
+If there are no pods, investigate `etcd`.
+
+. Check the cluster nodes to ensure they have a fully qualified domain name, and not just `localhost.localdomain`, by using the following command:
 +
 [source,terminal]
 ----
@@ -21,10 +37,10 @@ If a hostname is not set, set the correct hostname. For example:
 +
 [source,terminal]
 ----
-$ hostnamectl set-hostname <hostname>
+$ sudo hostnamectl set-hostname <hostname>
----
 
-. **Incorrect Name Resolution:** Ensure that each node has the correct name resolution in the DNS server using `dig` and `nslookup`. For example:
+. Ensure that each node has the correct name resolution in the DNS server using the `dig` command:
 +
 [source,terminal]
 ----
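The hostname check in the renamed module boils down to "is this a fully qualified name, not the localhost default". A rough sketch of that test as a shell function; the qualifying rule used here (contains a dot, and is not `localhost` or `localhost.localdomain`) is an approximation for illustration, not a definition from the docs:

```shell
# Sketch only: approximate the "fully qualified, not localhost.localdomain" check.
is_fqdn() {
    case "$1" in
        localhost|localhost.localdomain) return 1 ;;  # default names to reject
        *.*) return 0 ;;                              # has a domain part
        *) return 1 ;;                                # bare short hostname
    esac
}

if is_fqdn "$(hostname)"; then
    echo "hostname looks fully qualified"
else
    echo "set a hostname, for example: sudo hostnamectl set-hostname <hostname>"
fi
```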

modules/ipi-install-troubleshooting-misc-issues.adoc

Lines changed: 3 additions & 3 deletions
@@ -14,7 +14,7 @@ After the deployment of a cluster you might receive the following error:
 `runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: Missing CNI default network`
 ----
 
-The Cluster Network Operator is responsible for deploying the networking components in response to a special object created by the installer. It runs very early in the installation process, after the control plane (master) nodes have come up, but before the bootstrap control plane has been torn down. It can be indicative of more subtle installer issues, such as long delays in bringing up control plane (master) nodes or issues with `apiserver` communication.
+The Cluster Network Operator is responsible for deploying the networking components in response to a special object created by the installation program. It runs very early in the installation process, after the control plane (master) nodes have come up, but before the bootstrap control plane has been torn down. It can be indicative of more subtle installation program issues, such as long delays in bringing up control plane (master) nodes or issues with `apiserver` communication.
 
 .Procedure
 
@@ -54,7 +54,7 @@ spec:
   networkType: OVNKubernetes
 ----
 +
-If it does not exist, the installer did not create it. To determine why the installer did not create it, execute the following:
+If it does not exist, the installation program did not create it. To determine why the installation program did not create it, execute the following:
 +
 [source,terminal]
 ----
@@ -75,7 +75,7 @@ $ kubectl -n openshift-network-operator get pods
 $ kubectl -n openshift-network-operator logs -l "name=network-operator"
 ----
 +
-On high availability clusters with three or more control plane (master) nodes, the Operator will perform leader election and all other Operators will sleep. For additional details, see https://github.com/openshift/installer/blob/master/docs/user/troubleshooting.md[Troubleshooting].
+On high availability clusters with three or more control plane nodes, the Operator will perform leader election and all other Operators will sleep. For additional details, see https://github.com/openshift/installer/blob/master/docs/user/troubleshooting.md[Troubleshooting].
 
 == Addressing the "No disk found with matching rootDeviceHints" error message

modules/ipi-install-troubleshooting-reviewing-the-installation.adoc

Lines changed: 2 additions & 2 deletions
@@ -6,7 +6,7 @@
 
 = Reviewing the installation
 
-After installation, ensure the installer deployed the nodes and pods successfully.
+After installation, ensure the installation program deployed the nodes and pods successfully.
 
 .Procedure
 
@@ -25,7 +25,7 @@ master-1.example.com Ready master,worker 4h v1.28.5
 master-2.example.com   Ready    master,worker   4h   v1.28.5
 ----
 
-. Confirm the installer deployed all pods successfully. The following command
+. Confirm the installation program deployed all pods successfully. The following command
 removes any pods that are still running or have completed as part of the output.
 +
 [source,terminal]
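The node review in this module is a readiness scan of `oc get nodes` output. A hypothetical sketch of that scan, run against canned output modeled on the module's example rather than a live cluster:

```shell
# Sketch only: count nodes that are not Ready, using canned `oc get nodes`
# output modeled on the module's example.
nodes='NAME                   STATUS   ROLES           AGE   VERSION
master-0.example.com   Ready    master,worker   4h    v1.28.5
master-1.example.com   Ready    master,worker   4h    v1.28.5
master-2.example.com   Ready    master,worker   4h    v1.28.5'

# Skip the header row; print any row whose STATUS column is not "Ready".
not_ready_count=$(printf '%s\n' "$nodes" | awk 'NR > 1 && $2 != "Ready"' | wc -l)
echo "nodes not Ready: $not_ready_count"
```

A zero count here corresponds to the "all nodes deployed successfully" outcome the procedure is checking for.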
modules/ipi-install-troubleshooting-troubleshooting-failure-to-add-the-ingress-certificate-to-kubeconfig.adoc (new file)

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+// This module is included in the following assemblies:
+//
+// installing/installing_bare_metal_ipi/ipi-install-troubleshooting.adoc
+
+:_mod-docs-content-type: PROCEDURE
+[id="troubleshooting-failure-to-add-the-ingress-certificate-to-kubeconfig_{context}"]
+= Troubleshooting a failure to add the ingress certificate to kubeconfig
+
+The installation program adds the default ingress certificate to the list of trusted client certificate authorities in `${INSTALL_DIR}/auth/kubeconfig`. If the installation program fails to add the ingress certificate to the `kubeconfig` file, you can retrieve the certificate from the cluster and add it.
+
+.Procedure
+
+. Retrieve the certificate from the cluster using the following command:
++
+[source,terminal]
+----
+$ oc --kubeconfig=${INSTALL_DIR}/auth/kubeconfig get configmaps default-ingress-cert \
+-n openshift-config-managed -o=jsonpath='{.data.ca-bundle\.crt}'
+----
++
+[source,terminal]
+----
+-----BEGIN CERTIFICATE-----
+MIIC/TCCAeWgAwIBAgIBATANBgkqhkiG9w0BAQsFADAuMSwwKgYDVQQDDCNjbHVz
+dGVyLWluZ3Jlc3Mtb3BlcmF0b3JAMTU1MTMwNzU4OTAeFw0xOTAyMjcyMjQ2Mjha
+Fw0yMTAyMjYyMjQ2MjlaMC4xLDAqBgNVBAMMI2NsdXN0ZXItaW5ncmVzcy1vcGVy
+YXRvckAxNTUxMzA3NTg5MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA
+uCA4fQ+2YXoXSUL4h/mcvJfrgpBfKBW5hfB8NcgXeCYiQPnCKblH1sEQnI3VC5Pk
+2OfNCF3PUlfm4i8CHC95a7nCkRjmJNg1gVrWCvS/ohLgnO0BvszSiRLxIpuo3C4S
+EVqqvxValHcbdAXWgZLQoYZXV7RMz8yZjl5CfhDaaItyBFj3GtIJkXgUwp/5sUfI
+LDXW8MM6AXfuG+kweLdLCMm3g8WLLfLBLvVBKB+4IhIH7ll0buOz04RKhnYN+Ebw
+tcvFi55vwuUCWMnGhWHGEQ8sWm/wLnNlOwsUz7S1/sW8nj87GFHzgkaVM9EOnoNI
+gKhMBK9ItNzjrP6dgiKBCQIDAQABoyYwJDAOBgNVHQ8BAf8EBAMCAqQwEgYDVR0T
+AQH/BAgwBgEB/wIBADANBgkqhkiG9w0BAQsFAAOCAQEAq+vi0sFKudaZ9aUQMMha
+CeWx9CZvZBblnAWT/61UdpZKpFi4eJ2d33lGcfKwHOi2NP/iSKQBebfG0iNLVVPz
+vwLbSG1i9R9GLdAbnHpPT9UG6fLaDIoKpnKiBfGENfxeiq5vTln2bAgivxrVlyiq
++MdDXFAWb6V4u2xh6RChI7akNsS3oU9PZ9YOs5e8vJp2YAEphht05X0swA+X8V8T
+C278FFifpo0h3Q0Dbv8Rfn4UpBEtN4KkLeS+JeT+0o2XOsFZp7Uhr9yFIodRsnNo
+H/Uwmab28ocNrGNiEVaVH6eTTQeeZuOdoQzUbClElpVmkrNGY0M42K0PvOQ/e7+y
+AQ==
+-----END CERTIFICATE-----
+----
+
+. Add the certificate to the `client-certificate-authority-data` field in the `${INSTALL_DIR}/auth/kubeconfig` file.
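kubeconfig files store certificate material base64-encoded on a single line, so "add the certificate" in the last step implies encoding the retrieved PEM bundle first. A sketch with a placeholder certificate and a scratch file; the field name is taken from the module's wording and is not verified against the kubeconfig schema:

```shell
# Sketch only: encode a PEM bundle the way kubeconfig stores certificate data.
# The certificate body below is a placeholder, not a real certificate.
workdir=$(mktemp -d)
cat > "$workdir/ingress-ca.crt" <<'EOF'
-----BEGIN CERTIFICATE-----
PLACEHOLDERBASE64DATA
-----END CERTIFICATE-----
EOF

# kubeconfig expects one unbroken base64 line.
encoded=$(base64 < "$workdir/ingress-ca.crt" | tr -d '\n')
printf 'client-certificate-authority-data: %s\n' "$encoded" >> "$workdir/kubeconfig"

grep -q 'client-certificate-authority-data' "$workdir/kubeconfig" && echo "certificate recorded"
rm -rf "$workdir"
```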
modules/ipi-install-troubleshooting-troubleshooting-failure-to-fetch-the-console-url.adoc (new file)

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
+// This module is included in the following assemblies:
+//
+// installing/installing_bare_metal_ipi/ipi-install-troubleshooting.adoc
+
+:_mod-docs-content-type: PROCEDURE
+[id="troubleshooting-failure-to-fetch-the-console-url_{context}"]
+= Troubleshooting a failure to fetch the console URL
+
+The installation program retrieves the URL for the {product-title} console by using `[route][route-object]` within the `openshift-console` namespace. If the installation program fails to retrieve the URL for the console, use the following procedure.
+
+.Procedure
+
+. Check if the console router is in the `Available` or `Failing` state by running the following command:
++
+[source,terminal]
+----
+$ oc --kubeconfig=${INSTALL_DIR}/auth/kubeconfig get clusteroperator console -oyaml
+----
++
+[source,yaml]
+----
+apiVersion: config.openshift.io/v1
+kind: ClusterOperator
+metadata:
+  creationTimestamp: 2019-02-27T22:46:57Z
+  generation: 1
+  name: console
+  resourceVersion: "19682"
+  selfLink: /apis/config.openshift.io/v1/clusteroperators/console
+  uid: 960364aa-3ae1-11e9-bad4-0a97b6ba9358
+spec: {}
+status:
+  conditions:
+  - lastTransitionTime: 2019-02-27T22:46:58Z
+    status: "False"
+    type: Failing
+  - lastTransitionTime: 2019-02-27T22:50:12Z
+    status: "False"
+    type: Progressing
+  - lastTransitionTime: 2019-02-27T22:50:12Z
+    status: "True"
+    type: Available
+  - lastTransitionTime: 2019-02-27T22:46:57Z
+    status: "True"
+    type: Upgradeable
+  extension: null
+  relatedObjects:
+  - group: operator.openshift.io
+    name: cluster
+    resource: consoles
+  - group: config.openshift.io
+    name: cluster
+    resource: consoles
+  - group: oauth.openshift.io
+    name: console
+    resource: oauthclients
+  - group: ""
+    name: openshift-console-operator
+    resource: namespaces
+  - group: ""
+    name: openshift-console
+    resource: namespaces
+  versions: null
+----
+
+. Manually retrieve the console URL by executing the following command:
++
+[source,terminal]
+----
+$ oc --kubeconfig=${INSTALL_DIR}/auth/kubeconfig get route console -n openshift-console \
+-o=jsonpath='{.spec.host}' console-openshift-console.apps.adahiya-1.devcluster.openshift.com
+----
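The jsonpath expression in the last step simply pulls `.spec.host` out of the route object. A hypothetical sketch of the same extraction against a canned, heavily trimmed route fragment (host value invented) instead of a live cluster:

```shell
# Sketch only: extract .spec.host from a route object, as the jsonpath query
# above does, using an invented single-line JSON fragment.
route_json='{"kind":"Route","spec":{"host":"console-openshift-console.apps.example.com"}}'

# Capture the value of the "host" key.
console_host=$(printf '%s' "$route_json" | sed -n 's/.*"host":"\([^"]*\)".*/\1/p')
echo "https://$console_host"
```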
