diff --git a/cloud/azure/README.md b/cloud/azure/README.md new file mode 100644 index 0000000000..3d1e9a6bc7 --- /dev/null +++ b/cloud/azure/README.md @@ -0,0 +1,14 @@ +1. In the Azure portal, search for **Function App** (make sure the subscription is set to **eng-cloud-dev**). + In the list, find the Function App named **DeleteOrpanedK8sResources**. + +2. Open this Function App and select the function **aks-cleanup-function**. + +3. To update this function, modify the code **locally** and then **redeploy** it to Azure. + +# To redeploy the function, run the following in the jenkins-pipelines/cloud/azure/cmd folder: +``` +zip -r ../aks-cleanup.zip . -x "local.settings.json" ".funcignore" "**/__pycache__/*" ".git/*" ".venv/*" + +az functionapp deployment source config-zip --resource-group percona-operators --name DeleteOrpanedK8sResources --src ../aks-cleanup.zip + +``` diff --git a/cloud/azure/cmd/.funcignore b/cloud/azure/cmd/.funcignore new file mode 100644 index 0000000000..41740c9736 --- /dev/null +++ b/cloud/azure/cmd/.funcignore @@ -0,0 +1,6 @@ +.venv/ +.env +__pycache__/ +.local/ +bin/ +obj/ diff --git a/cloud/azure/cmd/aks-cleanup-function/__init__.py b/cloud/azure/cmd/aks-cleanup-function/__init__.py new file mode 100644 index 0000000000..704cc4d799 --- /dev/null +++ b/cloud/azure/cmd/aks-cleanup-function/__init__.py @@ -0,0 +1,169 @@ +# Remove expired AKS clusters (Azure, cluster-only) + +import os +import math +import logging +import datetime +import time +import azure.functions as func +from typing import List, Dict, Optional + +from azure.identity import DefaultAzureCredential +from azure.mgmt.resource import ResourceManagementClient +from azure.mgmt.containerservice import ContainerServiceClient +from azure.core.exceptions import ResourceNotFoundError, HttpResponseError + +DRY_RUN = os.getenv("DRY_RUN", "true").lower() == "true" + +credential: Optional[DefaultAzureCredential] = None +resource_groups_client: Optional[ResourceManagementClient] = None +aks_client: 
Optional[ContainerServiceClient] = None + +# Resolve RG for a cluster name +CLUSTER_RG_MAP: Dict[str, str] = {} + + +def parse_epoch_creation_time(tags: dict) -> Optional[datetime.datetime]: + """Try parse tags['creation-time'] (epoch seconds) into aware datetime UTC.""" + raw = (tags or {}).get("creation-time") + if not raw: + return None + try: + ts = float(raw) + return datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc) + except Exception: + logging.warning("Invalid creation-time tag: %r", raw) + return None + + +def is_cluster_to_terminate(cluster) -> bool: + """ + Delete rules: + - requires tag team=cloud (case-insensitive) + - if TTL tag missing -> True (delete by policy) + - else TTL must be an integer number of hours + - delete when (now - creation-time[tag]) in hours > TTL + - if TTL present but creation-time missing/invalid -> safe skip + """ + tags = cluster.tags or {} + name = getattr(cluster, "name", "") + logging.info("Cluster %s tags: %s", name, tags) + + if tags.get("team", "").lower() != "cloud": + return False + + ttl_hours = tags.get("delete-cluster-after-hours") + if ttl_hours is None: + logging.info("Cluster %s has no TTL tag — marked for deletion by policy", name) + return True + + created_at = parse_epoch_creation_time(tags) + logging.info("Cluster %s created_at: %s", cluster.name, created_at) + if created_at is None: + logging.info("Cluster %s has TTL but no valid creation-time tag — skipping", name) + return False + now = datetime.datetime.now(datetime.timezone.utc) + lifetime_hours = int(math.ceil((now - created_at).total_seconds() / 3600.0)) + + return lifetime_hours > int(ttl_hours) + + +def get_clusters_to_terminate() -> List[str]: + """ + Scan all resource groups, return cluster names to delete. + Also populate CLUSTER_RG_MAP[name] = rg for later deletion. 
+ """ + clusters_for_deletion: List[str] = [] + CLUSTER_RG_MAP.clear() + + for rg in resource_groups_client.resource_groups.list(): + rg_name = rg.name + try: + for mc in aks_client.managed_clusters.list_by_resource_group(rg_name): + if is_cluster_to_terminate(mc): + clusters_for_deletion.append(mc.name) + CLUSTER_RG_MAP[mc.name] = rg_name + except HttpResponseError as e: + logging.warning("Failed to list AKS in RG %s: %s", rg_name, e) + + if not clusters_for_deletion: + logging.info("There are no clusters for deletion") + return clusters_for_deletion + + +def wait_for_cluster_delete(cluster_name: str, timeout: int = 300, sleep_time: int = 10): + """Poll until the AKS cluster disappears (or timeout).""" + attempts = timeout // sleep_time + for attempt in range(attempts): + rg_name = CLUSTER_RG_MAP.get(cluster_name) + if not rg_name: + logging.info("Cluster %s RG mapping missing; assuming deleted", cluster_name) + return + try: + _ = aks_client.managed_clusters.get(rg_name, cluster_name) + logging.info( + "Cluster %s still exists. Attempt %d/%d. Sleeping %ds.", + cluster_name, attempt + 1, attempts, sleep_time + ) + time.sleep(sleep_time) + except ResourceNotFoundError: + logging.info("Cluster %s was successfully deleted.", cluster_name) + return + except HttpResponseError as e: + status = getattr(e, "status_code", None) + if status == 404 or "NotFound" in str(e) or "404" in str(e): + logging.info("Cluster %s was successfully deleted.", cluster_name) + return + logging.warning("Error checking cluster %s: %s", cluster_name, e) + time.sleep(sleep_time) + logging.error("Cluster %s was not deleted in %d seconds.", cluster_name, timeout) + + +def delete_cluster(cluster_name: str): + """ + Resolve RG from CLUSTER_RG_MAP (or scan), then delete the AKS cluster. 
+ """ + rg_name = CLUSTER_RG_MAP.get(cluster_name) + if not rg_name: + # Slow path: try to resolve by scanning RGs + for rg in resource_groups_client.resource_groups.list(): + try: + _ = aks_client.managed_clusters.get(rg.name, cluster_name) + rg_name = rg.name + CLUSTER_RG_MAP[cluster_name] = rg_name + break + except Exception: + continue + + if not rg_name: + logging.info("Cluster %s not found — skipping", cluster_name) + return + + if DRY_RUN: + logging.info("[DRY-RUN] Would delete cluster %s/%s", rg_name, cluster_name) + return + + aks_client.managed_clusters.begin_delete(rg_name, cluster_name) + wait_for_cluster_delete(cluster_name) + + +def main(mytimer: func.TimerRequest) -> None: + + global credential, resource_groups_client, aks_client + + logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") + + subscription_id = os.getenv("AZURE_SUBSCRIPTION_ID") + if not subscription_id: + logging.error("AZURE_SUBSCRIPTION_ID is not set") + return + + credential = DefaultAzureCredential() + resource_groups_client = ResourceManagementClient(credential, subscription_id) + aks_client = ContainerServiceClient(credential, subscription_id) + + logging.info("Searching for AKS clusters to remove.") + clusters = get_clusters_to_terminate() + for cluster in clusters: + logging.info("Terminating %s", cluster) + delete_cluster(cluster) diff --git a/cloud/azure/cmd/aks-cleanup-function/function.json b/cloud/azure/cmd/aks-cleanup-function/function.json new file mode 100644 index 0000000000..53551154e0 --- /dev/null +++ b/cloud/azure/cmd/aks-cleanup-function/function.json @@ -0,0 +1,12 @@ +{ + "scriptFile": "__init__.py", + "entryPoint": "main", + "bindings": [ + { + "name": "mytimer", + "type": "timerTrigger", + "direction": "in", + "schedule": "0 0 * * * *" + } + ] +} \ No newline at end of file diff --git a/cloud/azure/cmd/host.json b/cloud/azure/cmd/host.json new file mode 100644 index 0000000000..5ce595737f --- /dev/null +++ 
b/cloud/azure/cmd/host.json @@ -0,0 +1 @@ +{ "version": "2.0" } diff --git a/cloud/azure/cmd/local.settings.json b/cloud/azure/cmd/local.settings.json new file mode 100644 index 0000000000..be1644756d --- /dev/null +++ b/cloud/azure/cmd/local.settings.json @@ -0,0 +1,10 @@ +{ + "IsEncrypted": false, + "Values": { + "AzureWebJobsStorage": "UseDevelopmentStorage=true", + "FUNCTIONS_WORKER_RUNTIME": "python", + "DRY_RUN": "true", + "SLEEP_BETWEEN_DELETES_SECONDS": "0.2", + "DELETE_START_MAX_RETRIES": "3" + } +} \ No newline at end of file diff --git a/cloud/azure/cmd/requirements.txt b/cloud/azure/cmd/requirements.txt new file mode 100644 index 0000000000..79e6a4d781 --- /dev/null +++ b/cloud/azure/cmd/requirements.txt @@ -0,0 +1,5 @@ +azure-functions>=1.18.0 +azure-identity>=1.17.1 +azure-mgmt-containerservice>=31.0.0 +azure-mgmt-resource>=23.1.1 +azure-core>=1.30.0 diff --git a/cloud/jenkins/pgo_aks.groovy b/cloud/jenkins/pgo_aks.groovy index 27850c4296..109fafc4ed 100644 --- a/cloud/jenkins/pgo_aks.groovy +++ b/cloud/jenkins/pgo_aks.groovy @@ -220,6 +220,7 @@ void createCluster(String CLUSTER_SUFFIX) { --enable-cluster-autoscaler \ --outbound-type loadbalancer \ --kubernetes-version $PLATFORM_VER \ + --tags team=cloud delete-cluster-after-hours=6 creation-time=\$(date -u +%s) \ -l $location az aks get-credentials --subscription eng-cloud-dev --resource-group percona-operators --name $CLUSTER_NAME-$CLUSTER_SUFFIX --overwrite-existing """ diff --git a/cloud/jenkins/psmdbo_aks.groovy b/cloud/jenkins/psmdbo_aks.groovy index e45f102830..16e9cc8563 100644 --- a/cloud/jenkins/psmdbo_aks.groovy +++ b/cloud/jenkins/psmdbo_aks.groovy @@ -243,6 +243,7 @@ void createCluster(String CLUSTER_SUFFIX) { --enable-cluster-autoscaler \ --outbound-type loadbalancer \ --kubernetes-version $PLATFORM_VER \ + --tags team=cloud delete-cluster-after-hours=6 creation-time=\$(date -u +%s) \ -l $location az aks get-credentials --subscription eng-cloud-dev --resource-group percona-operators --name 
$CLUSTER_NAME-$CLUSTER_SUFFIX --overwrite-existing """ diff --git a/cloud/jenkins/pxco_aks.groovy b/cloud/jenkins/pxco_aks.groovy index 59ba60d701..ac91bbe6e0 100644 --- a/cloud/jenkins/pxco_aks.groovy +++ b/cloud/jenkins/pxco_aks.groovy @@ -212,6 +212,7 @@ void createCluster(String CLUSTER_SUFFIX) { --enable-cluster-autoscaler \ --outbound-type loadbalancer \ --kubernetes-version $PLATFORM_VER \ + --tags team=cloud delete-cluster-after-hours=6 creation-time=\$(date -u +%s) \ -l $location az aks get-credentials --subscription eng-cloud-dev --resource-group percona-operators --name $CLUSTER_NAME-$CLUSTER_SUFFIX --overwrite-existing """