Skip to content

Commit 8b3686d

Browse files
PVC utilization refactor (#1133)
1 parent 10f3625 commit 8b3686d

File tree

2 files changed

+138
-68
lines changed

2 files changed

+138
-68
lines changed

.github/workflows/all_module_test.yml

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,11 @@ jobs:
2525

2626
- uses: actions/checkout@v3
2727

28+
- name: Set up Python 3.11
29+
uses: actions/setup-python@v4
30+
with:
31+
python-version: '3.11'
32+
2833
- name: Configure AWS Credentials
2934
uses: aws-actions/configure-aws-credentials@v1
3035
with:
@@ -36,18 +41,29 @@ jobs:
3641
- name: Install system dependencies
3742
run: |
3843
pip install shyaml
39-
pip install --upgrade pip==22.2.2
44+
pip install --upgrade pip
4045
sudo apt update
4146
sudo apt install -y wget
42-
sudo apt install -y awscli
47+
48+
# Install NumPy first with a compatible version
49+
pip install numpy>=1.22.0
50+
51+
# Install PyArrow with binary wheel - no build required
52+
pip install pyarrow --only-binary=pyarrow
53+
54+
# Continue with other dependencies
4355
aws s3 cp ${{ secrets.BUILD_REQUIREMENTS }} /tmp/requirements.txt
44-
pip install --no-cache-dir -r /tmp/requirements.txt
45-
aws s3 cp ${{ secrets.MAIN_MODULE_BUILD_PACKAGE }} /tmp/main_module.tar.gz
56+
pip install --no-cache-dir -r /tmp/requirements.txt || true
57+
58+
# Install main and sub modules
59+
aws s3 cp ${{ secrets.MAIN_MODULE_BUILD_PACKAGE }} /tmp/main_module.tar.gz
4660
pip install --no-cache-dir /tmp/main_module.tar.gz
47-
aws s3 cp ${{ secrets.SUB_MODULE_BUILD_PACKAGE }} /tmp/sub_module.tar.gz
61+
aws s3 cp ${{ secrets.SUB_MODULE_BUILD_PACKAGE }} /tmp/sub_module.tar.gz
4862
pip install --no-cache-dir /tmp/sub_module.tar.gz
63+
64+
# Additional dependencies
4965
pip install --no-cache-dir matplotlib>=3.7.1
66+
pip install setuptools wheel cython
5067
5168
- name: Run All Modules Check
52-
5369
run: /usr/bin/env python all_modules_test.py

Kubernetes/legos/k8s_check_service_pvc_utilization/k8s_check_service_pvc_utilization.py

Lines changed: 116 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,23 @@
88
from pydantic import BaseModel, Field
99
from kubernetes.client.rest import ApiException
1010

11+
1112
class InputSchema(BaseModel):
    """Input parameters for the service PVC utilization check.

    Declares the Kubernetes namespace, the list of services whose attached
    PVCs should be inspected, and the percentage threshold above which a
    PVC's disk usage triggers an alert.
    """

    # Kubernetes namespace that hosts the services being checked.
    namespace: str = Field(
        ...,
        description="The namespace in which the service resides.",
        title="Namespace",
    )
    # Services whose attached PVC utilization should be inspected.
    core_services: list = Field(
        ...,
        description="List of services for which the used PVC size needs to be checked.",
        title="K8s Service name",
    )
    # Alert when used space exceeds this percentage of total PVC capacity.
    # Note: fixed the missing space after "size." in the description text.
    threshold: Optional[int] = Field(
        80,
        description="Percentage threshold for utilized PVC disk size. E.g., a 80% threshold checks if the utilized space exceeds 80% of the total PVC capacity.",
        title="Threshold (in %)",
    )
2328

2429

2530
def k8s_check_service_pvc_utilization_printer(output):
@@ -28,13 +33,18 @@ def k8s_check_service_pvc_utilization_printer(output):
2833
if status:
2934
print("Disk sizes for all checked services are within the threshold.")
3035
else:
31-
print("ALERT: One or more PVC disk sizes are below the threshold:")
36+
print("ALERT: One or more PVC disk sizes are above threshold:")
3237
print("-" * 40)
3338
for pvc in pvc_info:
34-
print(f"PVC: {pvc['pvc_name']} - Utilized: {pvc['used']} of {pvc['capacity']}")
39+
print(
40+
f"PVC: {pvc['pvc_name']} - Utilized: {pvc['used']} of {pvc['capacity']}"
41+
)
3542
print("-" * 40)
3643

37-
def k8s_check_service_pvc_utilization(handle, core_services: list, namespace:str, threshold: int = 80) -> Tuple:
44+
45+
def k8s_check_service_pvc_utilization(
46+
handle, core_services: list, namespace: str, threshold: int = 80
47+
) -> Tuple:
3848
"""
3949
k8s_check_service_pvc_utilization checks the utilized disk size of a service's PVC against a given threshold.
4050
@@ -55,7 +65,7 @@ def k8s_check_service_pvc_utilization(handle, core_services: list, namespace:str
5565
:type namespace: str
5666
:param namespace: The namespace in which the service resides.
5767
58-
:return: Status and dictionary with PVC name and its size information if the PVC's disk size is below the threshold.
68+
:return: Status and dictionary with PVC name and its size information if the PVC's disk size exceeds threshold.
5969
"""
6070

6171
alert_pvcs_all_services = []
@@ -68,122 +78,166 @@ def k8s_check_service_pvc_utilization(handle, core_services: list, namespace:str
6878
if not response.stdout.strip():
6979
# No labels found for a particular service. Skipping...
7080
continue
71-
labels_dict = json.loads(response.stdout.replace("'", "\""))
81+
labels_dict = json.loads(response.stdout.replace("'", '"'))
7282
label_selector = ",".join([f"{k}={v}" for k, v in labels_dict.items()])
7383

7484
# Fetch the pod attached to this service.
7585
# The safer option is to try with the * option. Having a specific index like 0 or 1
76-
# will lead to ApiException.
86+
# will lead to ApiException.
7787
get_pod_command = f"kubectl get pods -n {namespace} -l {label_selector} -o=jsonpath='{{.items[*].metadata.name}}'"
7888
response = handle.run_native_cmd(get_pod_command)
7989
if not response or response.stderr:
80-
raise ApiException(f"Error while executing command ({get_pod_command}): {response.stderr if response else 'empty response'}")
90+
raise ApiException(
91+
f"Error while executing command ({get_pod_command}): {response.stderr if response else 'empty response'}"
92+
)
8193

8294
# pod_names stores the output from the above kubectl command, which is a list of pod_names separated by space
8395
pod_names = response.stdout.strip()
8496
if not pod_names:
8597
# No pods found for service {svc} in namespace {namespace} with labels {label_selector}
8698
continue
87-
99+
88100
# Fetch PVCs attached to the pod
89-
# The Above kubectl command would return a string that is space separated name(s) of the pod.
101+
# The Above kubectl command would return a string that is space separated name(s) of the pod.
90102
# Given such a string, lets find out if we have one or more than one pod name in the string.
91103
# If there are more than one pod name in the output, we need to iterate over all items[] array.
92-
# Else we can directly access the persistentVolumeClaim name
104+
# Else we can directly access the persistentVolumeClaim name
93105
# Lets also associate the pod_name along with the claim name (PVC Name) in the format of
94106
# pod_name:pv_claim_name
95-
107+
96108
if len(pod_names.split()) > 1:
97-
json_path_cmd = "{range .items[*]}{.metadata.name}:{range .spec.volumes[*].persistentVolumeClaim}{.claimName} {end}{\"\\n\"}{end}"
109+
json_path_cmd = '{range .items[*]}{.metadata.name}:{range .spec.volumes[*].persistentVolumeClaim}{.claimName} {end}{"\\n"}{end}'
98110
else:
99111
json_path_cmd = "{.metadata.name}:{range .spec.volumes[*].persistentVolumeClaim}{.claimName}{end}"
100112

101-
get_pvc_names_command = f"kubectl get pod {pod_names} -n {namespace} -o=jsonpath='{json_path_cmd}'"
102-
113+
get_pvc_names_command = (
114+
f"kubectl get pod {pod_names} -n {namespace} -o=jsonpath='{json_path_cmd}'"
115+
)
103116

104117
response = handle.run_native_cmd(get_pvc_names_command)
105118
if not response or response.stderr:
106-
raise ApiException(f"Error while executing command ({get_pvc_names_command}): {response.stderr if response else 'empty response'}")
119+
raise ApiException(
120+
f"Error while executing command ({get_pvc_names_command}): {response.stderr if response else 'empty response'}"
121+
)
107122
# Example: ['lightbeam-elasticsearch-master-0:data-lightbeam-elasticsearch-master-0']
108123
pod_and_pvc_names = response.stdout.strip().split()
109124

110-
111-
# The pod_and_pvc_names
125+
# The pod_and_pvc_names
112126
if not pod_and_pvc_names:
113127
services_without_pvcs.append(svc)
114128
continue
115129

116130
pvc_mounts = []
117131
alert_pvcs = []
118132
all_pvcs = []
119-
133+
120134
for element in pod_and_pvc_names:
121-
pod_name, claim_name = element.split(':')
135+
pod_name, claim_name = element.split(":")
122136
if not claim_name:
123137
# Skip if Volume Claim name is empty.
124-
continue
138+
continue
125139

126-
# Fetch the Pod JSON
140+
# Fetch the Pod JSON
127141
# We need to get the container name (if any) from the Pod's JSON. This is needed
128142
# if we want to exec into the POD that is within a container. The JSON data that
129143
# we obtain is used to fill the pvc_mounts list, which is a list of dictionaries.
130144
# We use this pvc_mounts to find out the used_space percentage. We compare that with
131-
# the threshold to flag if the utilization is above threshold.
145+
# the threshold to flag if the utilization is above threshold.
132146
# df -kh is the command used to get the disk utilization. This is accurate as we get
133147
# the disk utilization from the POD directly, rather than checking the resource limit
134-
# and resource request from the deployment / stateful YAML file.
148+
# and resource request from the deployment / stateful YAML file.
135149
get_pod_json_command = f"kubectl get pod {pod_name} -n {namespace} -o json"
136150
pod_json_output = handle.run_native_cmd(get_pod_json_command)
137151
if not pod_json_output or pod_json_output.stderr:
138-
raise ApiException(f"Error fetching pod json for {pod_name}: {pod_json_output.stderr if pod_json_output else 'empty response'}")
152+
raise ApiException(
153+
f"Error fetching pod json for {pod_name}: {pod_json_output.stderr if pod_json_output else 'empty response'}"
154+
)
139155
pod_data = json.loads(pod_json_output.stdout)
140-
156+
141157
# Dictionary .get() method with default value is way of error handling
142-
for container in pod_data.get('spec', {}).get('containers', {}):
143-
for mount in container.get('volumeMounts', {}):
144-
for volume in pod_data.get('spec', {}).get('volumes', {}):
145-
if 'persistentVolumeClaim' in volume and volume.get('name') == mount.get('name'):
158+
for container in pod_data.get("spec", {}).get("containers", {}):
159+
for mount in container.get("volumeMounts", {}):
160+
for volume in pod_data.get("spec", {}).get("volumes", {}):
161+
if "persistentVolumeClaim" in volume and volume.get(
162+
"name"
163+
) == mount.get("name"):
146164
try:
147-
claim_name = volume['persistentVolumeClaim']['claimName']
148-
pvc_mounts.append({
149-
"container_name": container['name'],
150-
"mount_path": mount['mountPath'],
151-
"pvc_name": claim_name if claim_name else None
152-
})
165+
claim_name = volume["persistentVolumeClaim"][
166+
"claimName"
167+
]
168+
pvc_mounts.append(
169+
{
170+
"container_name": container["name"],
171+
"mount_path": mount["mountPath"],
172+
"pvc_name": claim_name if claim_name else None,
173+
}
174+
)
153175
except KeyError as e:
154176
# Handle the KeyError (e.g., log the error, skip this iteration, etc.)
155177
print(f"KeyError: {e}. Skipping this entry.")
156178
except IndexError as e:
157179
# Handle the IndexError (e.g., log the error, skip this iteration, etc.)
158180
print(f"IndexError: {e}. Skipping this entry.")
159181

160-
161-
all_mounts = [mount.get('mount_path') for mount in pvc_mounts]
182+
all_mounts = [mount.get("mount_path") for mount in pvc_mounts]
162183
all_mounts = " ".join(all_mounts).strip()
163184
for mount in pvc_mounts:
164-
container_name = mount['container_name']
165-
mount_path = mount['mount_path']
166-
pvc_name = mount['pvc_name']
167-
all_pvcs.append({"pvc_name": pvc_name, "mount_path": mount_path, "used": None, "capacity": None})
185+
container_name = mount["container_name"]
186+
mount_path = mount["mount_path"]
187+
pvc_name = mount["pvc_name"]
188+
all_pvcs.append(
189+
{
190+
"pvc_name": pvc_name,
191+
"mount_path": mount_path,
192+
"used": None,
193+
"capacity": None,
194+
}
195+
)
168196

169197
du_command = f"kubectl exec -n {namespace} {pod_name} -c {container_name} -- df -kh {all_mounts} | grep -v Filesystem"
170198
du_output = handle.run_native_cmd(du_command)
171-
199+
172200
if du_output and not du_output.stderr:
173-
used_space = du_output.stdout.strip()
174-
for idx, space in enumerate([used_space]):
175-
space = space.split()
176-
used_percentage = int(space[-2].replace('%', ''))
177-
total_capacity_str = space[1].replace('%', '')
178-
all_pvcs[idx]["used"] = used_percentage
179-
all_pvcs[idx]["capacity"] = total_capacity_str
201+
# Process each line of df output separately
202+
df_lines = du_output.stdout.strip().split("\n")
203+
204+
for df_line in df_lines:
205+
if not df_line.strip():
206+
continue
207+
208+
# Split line into columns
209+
columns = re.split(r"\s+", df_line.strip())
210+
211+
# Find the percentage column (contains '%')
212+
percent_col = None
213+
for i, col in enumerate(columns):
214+
if "%" in col:
215+
percent_col = i
216+
break
217+
218+
if percent_col is None or len(columns) < 2:
219+
print(f"Warning: Unexpected df output format: {df_line}")
220+
continue
221+
222+
# Extract percentage and capacity
223+
used_percentage = int(columns[percent_col].replace("%", ""))
224+
total_capacity = columns[1] if len(columns) > 1 else "Unknown"
225+
pvc_info = {
226+
"pvc_name": pvc_name,
227+
"mount_path": mount_path,
228+
"used": used_percentage,
229+
"capacity": total_capacity,
230+
}
231+
232+
# Check if usage exceeds threshold
180233
if used_percentage > threshold:
181-
alert_pvcs.append(all_pvcs[idx])
234+
alert_pvcs.append(pvc_info)
182235

183236
alert_pvcs_all_services.extend(alert_pvcs)
237+
184238
if services_without_pvcs:
185239
print("Following services do not have any PVCs attached:")
186240
for service in services_without_pvcs:
187241
print(f"- {service}")
188242

189-
return (not bool(alert_pvcs_all_services), alert_pvcs_all_services)
243+
return (not bool(alert_pvcs_all_services), alert_pvcs_all_services)

0 commit comments

Comments
 (0)