Skip to content

Commit 8b3686d

Browse files
PVC utilization refactor (#1133)
1 parent 10f3625 commit 8b3686d

File tree

2 files changed

+138
-68
lines changed

2 files changed

+138
-68
lines changed

.github/workflows/all_module_test.yml

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,11 @@ jobs:
2525

2626
- uses: actions/checkout@v3
2727

28+
- name: Set up Python 3.11
29+
uses: actions/setup-python@v4
30+
with:
31+
python-version: '3.11'
32+
2833
- name: Configure AWS Credentials
2934
uses: aws-actions/configure-aws-credentials@v1
3035
with:
@@ -36,18 +41,29 @@ jobs:
3641
- name: Install system dependencies
3742
run: |
3843
pip install shyaml
39-
pip install --upgrade pip==22.2.2
44+
pip install --upgrade pip
4045
sudo apt update
4146
sudo apt install -y wget
42-
sudo apt install -y awscli
47+
48+
# Install NumPy first with a compatible version
49+
pip install numpy>=1.22.0
50+
51+
# Install PyArrow with binary wheel - no build required
52+
pip install pyarrow --only-binary=pyarrow
53+
54+
# Continue with other dependencies
4355
aws s3 cp ${{ secrets.BUILD_REQUIREMENTS }} /tmp/requirements.txt
44-
pip install --no-cache-dir -r /tmp/requirements.txt
45-
aws s3 cp ${{ secrets.MAIN_MODULE_BUILD_PACKAGE }} /tmp/main_module.tar.gz
56+
pip install --no-cache-dir -r /tmp/requirements.txt || true
57+
58+
# Install main and sub modules
59+
aws s3 cp ${{ secrets.MAIN_MODULE_BUILD_PACKAGE }} /tmp/main_module.tar.gz
4660
pip install --no-cache-dir /tmp/main_module.tar.gz
47-
aws s3 cp ${{ secrets.SUB_MODULE_BUILD_PACKAGE }} /tmp/sub_module.tar.gz
61+
aws s3 cp ${{ secrets.SUB_MODULE_BUILD_PACKAGE }} /tmp/sub_module.tar.gz
4862
pip install --no-cache-dir /tmp/sub_module.tar.gz
63+
64+
# Additional dependencies
4965
pip install --no-cache-dir matplotlib>=3.7.1
66+
pip install setuptools wheel cython
5067
5168
- name: Run All Modules Check
52-
5369
run: /usr/bin/env python all_modules_test.py

Kubernetes/legos/k8s_check_service_pvc_utilization/k8s_check_service_pvc_utilization.py

Lines changed: 116 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,23 @@
88
from pydantic import BaseModel, Field
99
from kubernetes.client.rest import ApiException
1010

11+
1112
class InputSchema(BaseModel):
    """Input parameters for the service PVC utilization check.

    Declares the Kubernetes namespace, the list of services whose attached
    PVCs should be inspected, and the percentage threshold above which a
    PVC's disk usage triggers an alert.
    """

    # Kubernetes namespace that hosts the services being checked.
    namespace: str = Field(
        ...,
        description="The namespace in which the service resides.",
        title="Namespace",
    )
    # Services whose attached PVC utilization should be inspected.
    core_services: list = Field(
        ...,
        description="List of services for which the used PVC size needs to be checked.",
        title="K8s Service name",
    )
    # Alert when used space exceeds this percentage of total PVC capacity.
    # Note: fixed the missing space after "size." in the description text.
    threshold: Optional[int] = Field(
        80,
        description="Percentage threshold for utilized PVC disk size. E.g., a 80% threshold checks if the utilized space exceeds 80% of the total PVC capacity.",
        title="Threshold (in %)",
    )
2328

2429

2530
def k8s_check_service_pvc_utilization_printer(output):
@@ -28,13 +33,18 @@ def k8s_check_service_pvc_utilization_printer(output):
2833
if status:
2934
print("Disk sizes for all checked services are within the threshold.")
3035
else:
31-
print("ALERT: One or more PVC disk sizes are below the threshold:")
36+
print("ALERT: One or more PVC disk sizes are above threshold:")
3237
print("-" * 40)
3338
for pvc in pvc_info:
34-
print(f"PVC: {pvc['pvc_name']} - Utilized: {pvc['used']} of {pvc['capacity']}")
39+
print(
40+
f"PVC: {pvc['pvc_name']} - Utilized: {pvc['used']} of {pvc['capacity']}"
41+
)
3542
print("-" * 40)
3643

37-
def k8s_check_service_pvc_utilization(handle, core_services: list, namespace:str, threshold: int = 80) -> Tuple:
44+
45+
def k8s_check_service_pvc_utilization(
46+
handle, core_services: list, namespace: str, threshold: int = 80
47+
) -> Tuple:
3848
"""
3949
k8s_check_service_pvc_utilization checks the utilized disk size of a service's PVC against a given threshold.
4050
@@ -55,7 +65,7 @@ def k8s_check_service_pvc_utilization(handle, core_services: list, namespace:str
5565
:type namespace: str
5666
:param namespace: The namespace in which the service resides.
5767
58-
:return: Status and dictionary with PVC name and its size information if the PVC's disk size is below the threshold.
68+
:return: Status and dictionary with PVC name and its size information if the PVC's disk size exceeds threshold.
5969
"""
6070

6171
alert_pvcs_all_services = []
@@ -68,122 +78,166 @@ def k8s_check_service_pvc_utilization(handle, core_services: list, namespace:str
6878
if not response.stdout.strip():
6979
# No labels found for a particular service. Skipping...
7080
continue
71-
labels_dict = json.loads(response.stdout.replace("'", "\""))
81+
labels_dict = json.loads(response.stdout.replace("'", '"'))
7282
label_selector = ",".join([f"{k}={v}" for k, v in labels_dict.items()])
7383

7484
# Fetch the pod attached to this service.
7585
# The safer option is to try with the * option. Having a specific index like 0 or 1
76-
# will lead to ApiException.
86+
# will lead to ApiException.
7787
get_pod_command = f"kubectl get pods -n {namespace} -l {label_selector} -o=jsonpath='{{.items[*].metadata.name}}'"
7888
response = handle.run_native_cmd(get_pod_command)
7989
if not response or response.stderr:
80-
raise ApiException(f"Error while executing command ({get_pod_command}): {response.stderr if response else 'empty response'}")
90+
raise ApiException(
91+
f"Error while executing command ({get_pod_command}): {response.stderr if response else 'empty response'}"
92+
)
8193

8294
# pod_names stores the output from the above kubectl command, which is a list of pod_names separated by space
8395
pod_names = response.stdout.strip()
8496
if not pod_names:
8597
# No pods found for service {svc} in namespace {namespace} with labels {label_selector}
8698
continue
87-
99+
88100
# Fetch PVCs attached to the pod
89-
# The Above kubectl command would return a string that is space separated name(s) of the pod.
101+
# The Above kubectl command would return a string that is space separated name(s) of the pod.
90102
# Given such a string, lets find out if we have one or more than one pod name in the string.
91103
# If there are more than one pod name in the output, we need to iterate over all items[] array.
92-
# Else we can directly access the persistentVolumeClaim name
104+
# Else we can directly access the persistentVolumeClaim name
93105
# Lets also associate the pod_name along with the claim name (PVC Name) in the format of
94106
# pod_name:pv_claim_name
95-
107+
96108
if len(pod_names.split()) > 1:
97-
json_path_cmd = "{range .items[*]}{.metadata.name}:{range .spec.volumes[*].persistentVolumeClaim}{.claimName} {end}{\"\\n\"}{end}"
109+
json_path_cmd = '{range .items[*]}{.metadata.name}:{range .spec.volumes[*].persistentVolumeClaim}{.claimName} {end}{"\\n"}{end}'
98110
else:
99111
json_path_cmd = "{.metadata.name}:{range .spec.volumes[*].persistentVolumeClaim}{.claimName}{end}"
100112

101-
get_pvc_names_command = f"kubectl get pod {pod_names} -n {namespace} -o=jsonpath='{json_path_cmd}'"
102-
113+
get_pvc_names_command = (
114+
f"kubectl get pod {pod_names} -n {namespace} -o=jsonpath='{json_path_cmd}'"
115+
)
103116

104117
response = handle.run_native_cmd(get_pvc_names_command)
105118
if not response or response.stderr:
106-
raise ApiException(f"Error while executing command ({get_pvc_names_command}): {response.stderr if response else 'empty response'}")
119+
raise ApiException(
120+
f"Error while executing command ({get_pvc_names_command}): {response.stderr if response else 'empty response'}"
121+
)
107122
# Example: ['lightbeam-elasticsearch-master-0:data-lightbeam-elasticsearch-master-0']
108123
pod_and_pvc_names = response.stdout.strip().split()
109124

110-
111-
# The pod_and_pvc_names
125+
# The pod_and_pvc_names
112126
if not pod_and_pvc_names:
113127
services_without_pvcs.append(svc)
114128
continue
115129

116130
pvc_mounts = []
117131
alert_pvcs = []
118132
all_pvcs = []
119-
133+
120134
for element in pod_and_pvc_names:
121-
pod_name, claim_name = element.split(':')
135+
pod_name, claim_name = element.split(":")
122136
if not claim_name:
123137
# Skip if Volume Claim name is empty.
124-
continue
138+
continue
125139

126-
# Fetch the Pod JSON
140+
# Fetch the Pod JSON
127141
# We need to get the container name (if any) from the Pod's JSON. This is needed
128142
# if we want to exec into the POD that is within a container. The JSON data that
129143
# we obtain is used to fill the pvc_mounts list, which is a list of dictionaries.
130144
# We use this pvc_mounts to find out the used_space percentage. We compare that with
131-
# the threshold to flag if the utilization is above threshold.
145+
# the threshold to flag if the utilization is above threshold.
132146
# df -kh is the command used to get the disk utilization. This is accurate as we get
133147
# the disk utilization from the POD directly, rather than checking the resource limit
134-
# and resource request from the deployment / stateful YAML file.
148+
# and resource request from the deployment / stateful YAML file.
135149
get_pod_json_command = f"kubectl get pod {pod_name} -n {namespace} -o json"
136150
pod_json_output = handle.run_native_cmd(get_pod_json_command)
137151
if not pod_json_output or pod_json_output.stderr:
138-
raise ApiException(f"Error fetching pod json for {pod_name}: {pod_json_output.stderr if pod_json_output else 'empty response'}")
152+
raise ApiException(
153+
f"Error fetching pod json for {pod_name}: {pod_json_output.stderr if pod_json_output else 'empty response'}"
154+
)
139155
pod_data = json.loads(pod_json_output.stdout)
140-
156+
141157
# Dictionary .get() method with default value is way of error handling
142-
for container in pod_data.get('spec', {}).get('containers', {}):
143-
for mount in container.get('volumeMounts', {}):
144-
for volume in pod_data.get('spec', {}).get('volumes', {}):
145-
if 'persistentVolumeClaim' in volume and volume.get('name') == mount.get('name'):
158+
for container in pod_data.get("spec", {}).get("containers", {}):
159+
for mount in container.get("volumeMounts", {}):
160+
for volume in pod_data.get("spec", {}).get("volumes", {}):
161+
if "persistentVolumeClaim" in volume and volume.get(
162+
"name"
163+
) == mount.get("name"):
146164
try:
147-
claim_name = volume['persistentVolumeClaim']['claimName']
148-
pvc_mounts.append({
149-
"container_name": container['name'],
150-
"mount_path": mount['mountPath'],
151-
"pvc_name": claim_name if claim_name else None
152-
})
165+
claim_name = volume["persistentVolumeClaim"][
166+
"claimName"
167+
]
168+
pvc_mounts.append(
169+
{
170+
"container_name": container["name"],
171+
"mount_path": mount["mountPath"],
172+
"pvc_name": claim_name if claim_name else None,
173+
}
174+
)
153175
except KeyError as e:
154176
# Handle the KeyError (e.g., log the error, skip this iteration, etc.)
155177
print(f"KeyError: {e}. Skipping this entry.")
156178
except IndexError as e:
157179
# Handle the IndexError (e.g., log the error, skip this iteration, etc.)
158180
print(f"IndexError: {e}. Skipping this entry.")
159181

160-
161-
all_mounts = [mount.get('mount_path') for mount in pvc_mounts]
182+
all_mounts = [mount.get("mount_path") for mount in pvc_mounts]
162183
all_mounts = " ".join(all_mounts).strip()
163184
for mount in pvc_mounts:
164-
container_name = mount['container_name']
165-
mount_path = mount['mount_path']
166-
pvc_name = mount['pvc_name']
167-
all_pvcs.append({"pvc_name": pvc_name, "mount_path": mount_path, "used": None, "capacity": None})
185+
container_name = mount["container_name"]
186+
mount_path = mount["mount_path"]
187+
pvc_name = mount["pvc_name"]
188+
all_pvcs.append(
189+
{
190+
"pvc_name": pvc_name,
191+
"mount_path": mount_path,
192+
"used": None,
193+
"capacity": None,
194+
}
195+
)
168196

169197
du_command = f"kubectl exec -n {namespace} {pod_name} -c {container_name} -- df -kh {all_mounts} | grep -v Filesystem"
170198
du_output = handle.run_native_cmd(du_command)
171-
199+
172200
if du_output and not du_output.stderr:
173-
used_space = du_output.stdout.strip()
174-
for idx, space in enumerate([used_space]):
175-
space = space.split()
176-
used_percentage = int(space[-2].replace('%', ''))
177-
total_capacity_str = space[1].replace('%', '')
178-
all_pvcs[idx]["used"] = used_percentage
179-
all_pvcs[idx]["capacity"] = total_capacity_str
201+
# Process each line of df output separately
202+
df_lines = du_output.stdout.strip().split("\n")
203+
204+
for df_line in df_lines:
205+
if not df_line.strip():
206+
continue
207+
208+
# Split line into columns
209+
columns = re.split(r"\s+", df_line.strip())
210+
211+
# Find the percentage column (contains '%')
212+
percent_col = None
213+
for i, col in enumerate(columns):
214+
if "%" in col:
215+
percent_col = i
216+
break
217+
218+
if percent_col is None or len(columns) < 2:
219+
print(f"Warning: Unexpected df output format: {df_line}")
220+
continue
221+
222+
# Extract percentage and capacity
223+
used_percentage = int(columns[percent_col].replace("%", ""))
224+
total_capacity = columns[1] if len(columns) > 1 else "Unknown"
225+
pvc_info = {
226+
"pvc_name": pvc_name,
227+
"mount_path": mount_path,
228+
"used": used_percentage,
229+
"capacity": total_capacity,
230+
}
231+
232+
# Check if usage exceeds threshold
180233
if used_percentage > threshold:
181-
alert_pvcs.append(all_pvcs[idx])
234+
alert_pvcs.append(pvc_info)
182235

183236
alert_pvcs_all_services.extend(alert_pvcs)
237+
184238
if services_without_pvcs:
185239
print("Following services do not have any PVCs attached:")
186240
for service in services_without_pvcs:
187241
print(f"- {service}")
188242

189-
return (not bool(alert_pvcs_all_services), alert_pvcs_all_services)
243+
return (not bool(alert_pvcs_all_services), alert_pvcs_all_services)

0 commit comments

Comments
 (0)