diff --git a/dev-infrastructure/Makefile.sre-tooling b/dev-infrastructure/Makefile.sre-tooling new file mode 100644 index 0000000000..3ab078f1dc --- /dev/null +++ b/dev-infrastructure/Makefile.sre-tooling @@ -0,0 +1,142 @@ +# +# SRE Tooling AKS Cluster +# Standalone Makefile - does not require main Makefile +# +# Environment variables (required unless marked optional): +# SRE_TOOLING_ENV: dev or pers +# SRE_TOOLING_RG: Resource group name (e.g., hcp-dev-sre-tooling or hcp-pers-sre-tooling) +# SRE_TOOLING_SUBSCRIPTION_ID: Subscription ID +# SERVICE_KEYVAULT_NAME: Name of existing service key vault +# SERVICE_KEYVAULT_RG: Resource group of service key vault +# REGIONAL_RG: Regional resource group name +# SVC_ACR_RESOURCE_ID: Resource ID of SVC ACR +# GLOBAL_MSI_ID: Resource ID of global MSI +# KV_CERT_OFFICER_PRINCIPAL_ID: Principal ID for KV certificate officer +# AZURE_MONITORING_WORKSPACE_ID: Resource ID of Azure Monitor Workspace (optional) +# ADMIN_API_MI_NAME: Name of Admin API managed identity +# AKS_CLUSTER_NAME: AKS cluster name override (optional; defaults to sre-tooling-aks for dev, pers-westus3-sre-tooling for pers) +# +# Usage: make -f Makefile.sre-tooling <target> +# where <target> is one of: sre-tooling-infra, sre-tooling-infra.what-if, sre-tooling-cluster, sre-tooling-cluster.what-if +# + +# Set SKIP_CONFIRM to a non-empty value to skip "what-if" confirmation prompts. 
+ifndef SKIP_CONFIRM
+PROMPT_TO_CONFIRM = "--confirm-with-what-if"
+endif
+
+# Supported values of SRE_TOOLING_ENV (the recipe checks below enforce the same two values).
+SRE_TOOLING_ENVS = dev pers
+
+# ARM deployment mode passed to every `create` AND `what-if` call below.
+# The infra and cluster templates are both deployed into SRE_TOOLING_RG. In
+# Complete mode each deployment deletes the resources of the group that are
+# not part of its own template, i.e. the resources created by the other
+# target. Default to Incremental; set SRE_TOOLING_DEPLOYMENT_MODE=Complete
+# explicitly if that behaviour is really wanted.
+SRE_TOOLING_DEPLOYMENT_MODE ?= Incremental
+
+# AKS cluster name: a non-empty AKS_CLUSTER_NAME supplied by the caller wins,
+# otherwise the name is derived (lazily) from SRE_TOOLING_ENV.
+DEFAULT_CLUSTER_NAME = $(if $(filter pers,$(SRE_TOOLING_ENV)),pers-westus3-sre-tooling,sre-tooling-aks)
+ifeq ($(strip $(AKS_CLUSTER_NAME)),)
+AKS_CLUSTER_NAME = $(DEFAULT_CLUSTER_NAME)
+endif
+
+# $(call sre_tooling_require,VAR [VAR...])
+# Expands to a shell snippet that stops the recipe with ">> VAR is not set"
+# at the first variable in the list whose value is empty.
+sre_tooling_require = $(foreach v,$(1),[ -n "$($(v))" ] || { echo ">> $(v) is not set"; exit 1; };)
+
+# Canned recipe: SRE_TOOLING_ENV must be set and be either "dev" or "pers".
+define sre_tooling_check_env
+@[ -n "$(SRE_TOOLING_ENV)" ] || ( echo ">> SRE_TOOLING_ENV is not set (dev or pers)"; exit 1 )
+@[ "$(SRE_TOOLING_ENV)" = "dev" ] || [ "$(SRE_TOOLING_ENV)" = "pers" ] || ( echo ">> SRE_TOOLING_ENV must be 'dev' or 'pers', got: $(SRE_TOOLING_ENV)"; exit 1 )
+endef
+
+# Variables each pair of targets needs in addition to SRE_TOOLING_ENV.
+sre_tooling_infra_vars = SRE_TOOLING_RG SRE_TOOLING_SUBSCRIPTION_ID \
+  SERVICE_KEYVAULT_NAME SERVICE_KEYVAULT_RG GLOBAL_MSI_ID KV_CERT_OFFICER_PRINCIPAL_ID
+sre_tooling_cluster_vars = SRE_TOOLING_RG SRE_TOOLING_SUBSCRIPTION_ID \
+  SERVICE_KEYVAULT_NAME SERVICE_KEYVAULT_RG REGIONAL_RG SVC_ACR_RESOURCE_ID \
+  GLOBAL_MSI_ID ADMIN_API_MI_NAME
+
+# Arguments shared by `az deployment group create` and `az deployment group
+# what-if`, so that the preview always describes the deployment that the
+# matching create target would run (same template, parameters and mode).
+sre_tooling_infra_args = \
+  --name "sre-tooling-infra-$(SRE_TOOLING_ENV)" \
+  --resource-group "$(SRE_TOOLING_RG)" \
+  --subscription "$(SRE_TOOLING_SUBSCRIPTION_ID)" \
+  --mode "$(SRE_TOOLING_DEPLOYMENT_MODE)" \
+  --template-file templates/sre-tooling-infra.bicep \
+  --parameters configurations/sre-tooling-infra.bicepparam \
+  --parameters serviceKeyVaultName="$(SERVICE_KEYVAULT_NAME)" \
+  --parameters serviceKeyVaultResourceGroup="$(SERVICE_KEYVAULT_RG)" \
+  --parameters globalMSIId="$(GLOBAL_MSI_ID)" \
+  --parameters kvCertOfficerPrincipalId="$(KV_CERT_OFFICER_PRINCIPAL_ID)" \
+  --parameters serviceKeyVaultTagValue="$(SRE_TOOLING_ENV)"
+
+# azureMonitoringWorkspaceId is only passed when AZURE_MONITORING_WORKSPACE_ID
+# is non-empty; otherwise the value from the .bicepparam file is used.
+sre_tooling_cluster_args = \
+  --name "sre-tooling-cluster-$(SRE_TOOLING_ENV)" \
+  --resource-group "$(SRE_TOOLING_RG)" \
+  --subscription "$(SRE_TOOLING_SUBSCRIPTION_ID)" \
+  --mode "$(SRE_TOOLING_DEPLOYMENT_MODE)" \
+  --template-file templates/sre-tooling-cluster.bicep \
+  --parameters configurations/sre-tooling-cluster.bicepparam \
+  --parameters serviceKeyVaultName="$(SERVICE_KEYVAULT_NAME)" \
+  --parameters serviceKeyVaultResourceGroup="$(SERVICE_KEYVAULT_RG)" \
+  --parameters regionalResourceGroup="$(REGIONAL_RG)" \
+  --parameters svcAcrResourceId="$(SVC_ACR_RESOURCE_ID)" \
+  --parameters globalMSIId="$(GLOBAL_MSI_ID)" \
+  --parameters adminApiMIName="$(ADMIN_API_MI_NAME)" \
+  --parameters aksKeyVaultName="sre-tooling-$(SRE_TOOLING_ENV)-etcd-kv" \
+  --parameters aksKeyVaultTagValue="$(SRE_TOOLING_ENV)" \
+  --parameters aksClusterName="$(AKS_CLUSTER_NAME)" \
+  $(if $(AZURE_MONITORING_WORKSPACE_ID),--parameters azureMonitoringWorkspaceId="$(AZURE_MONITORING_WORKSPACE_ID)")
+
+# Create (or update) the resource group, then deploy the infra template.
+# The group creation stays best-effort (`|| true`) as before.
+# NOTE(review): `|| true` also hides authentication/permission errors of
+# `az group create`; the deployment that follows will then fail instead.
+sre-tooling-infra:
+	$(sre_tooling_check_env)
+	@$(call sre_tooling_require,$(sre_tooling_infra_vars))
+	az group create \
+		--resource-group "$(SRE_TOOLING_RG)" --subscription "$(SRE_TOOLING_SUBSCRIPTION_ID)" \
+		--location westus3 --tags persist=true environment=$(SRE_TOOLING_ENV) || true
+	az deployment group create $(PROMPT_TO_CONFIRM) $(sre_tooling_infra_args)
+.PHONY: sre-tooling-infra
+
+# Preview the infra deployment without changing anything.
+sre-tooling-infra.what-if:
+	$(sre_tooling_check_env)
+	@$(call sre_tooling_require,$(sre_tooling_infra_vars))
+	az deployment group what-if $(sre_tooling_infra_args)
+.PHONY: sre-tooling-infra.what-if
+
+# Deploy the AKS cluster template into the same resource group.
+sre-tooling-cluster:
+	$(sre_tooling_check_env)
+	@$(call sre_tooling_require,$(sre_tooling_cluster_vars))
+	@echo "Using cluster name: $(AKS_CLUSTER_NAME)"
+	az deployment group create $(PROMPT_TO_CONFIRM) $(sre_tooling_cluster_args)
+.PHONY: sre-tooling-cluster
+
+# Preview the cluster deployment without changing anything.
+sre-tooling-cluster.what-if:
+	$(sre_tooling_check_env)
+	@$(call sre_tooling_require,$(sre_tooling_cluster_vars))
+	@echo "Using cluster name: $(AKS_CLUSTER_NAME)"
+	az deployment group what-if $(sre_tooling_cluster_args)
+.PHONY: sre-tooling-cluster.what-if
+
diff --git a/dev-infrastructure/configurations/sre-tooling-cluster.bicepparam b/dev-infrastructure/configurations/sre-tooling-cluster.bicepparam
new file mode 100644
index 0000000000..5a248dfd0c
--- /dev/null
+++ b/dev-infrastructure/configurations/sre-tooling-cluster.bicepparam
@@ -0,0 +1,66 @@
+using '../templates/sre-tooling-cluster.bicep'
+
+// Location
+param location = 'westus3'
+
+// Cluster identity, version and address space.
+// aksClusterName is a fallback only: dev-infrastructure/Makefile.sre-tooling
+// passes aksClusterName on the command line ('sre-tooling-aks' for dev,
+// 'pers-westus3-sre-tooling' for pers unless AKS_CLUSTER_NAME is set).
+param aksClusterName = 'sre-tooling-aks'
+param kubernetesVersion = '1.32'
+param vnetAddressPrefix = '10.0.0.0/16'
+param subnetPrefix = '10.0.0.0/24'
+param podSubnetPrefix = '10.0.1.0/24'
+
+// Network plugins
+param aksNetworkDataplane = 'azure'
+param aksNetworkPolicy = 'azure'
+
+// System agent pool
+param systemAgentVMSize = 'Standard_D2s_v3'
+param systemAgentPoolCount = 1
+param systemAgentMinCount = 2
+param systemAgentMaxCount = 3
+param systemAgentPoolZones = '1,2,3'
+param systemZoneRedundantMode = 'Zone'
+param aksSystemOsDiskSizeGB = 32
+
+// User agent pool
+param userAgentVMSize = 'Standard_D2s_v3'
+param userAgentPoolCount = 1
+param userAgentMinCount = 1
+param userAgentMaxCount = 3
+param userAgentPoolZones = '1,2,3'
+param userZoneRedundantMode = 'Zone'
+param userOsDiskSizeGB = 32
+
+// Infra agent pool (for Prometheus)
+param infraAgentVMSize = 'Standard_D4s_v3'
+param infraAgentPoolCount = 1
+param infraAgentMinCount = 1
+param infraAgentMaxCount = 2
+param infraAgentPoolZones = '1,2,3'
+param infraZoneRedundantMode = 'Zone'
+param infraOsDiskSizeGB = 64
+
+// Key Vault for AKS etcd and outbound IP tags.
+// aksKeyVaultName and aksKeyVaultTagValue are placeholders here; the Makefile
+// passes both on the command line.
+param aksKeyVaultName = ''
+param aksKeyVaultTagName = 'aro-hcp-environment'
+param aksKeyVaultTagValue = 'dev'
+param aksEtcdKVEnableSoftDelete = true
+param aksClusterOutboundIPAddressIPTags = ''
+
+// Placeholders that the Makefile overrides on the command line
+// (azureMonitoringWorkspaceId only when AZURE_MONITORING_WORKSPACE_ID is set).
+param svcAcrResourceId = ''
+param serviceKeyVaultName = ''
+param serviceKeyVaultResourceGroup = ''
+param regionalResourceGroup = ''
+param globalMSIId = ''
+param azureMonitoringWorkspaceId = ''
+param adminApiMIName = ''
+
+// Workload identity wiring (namespace / identity / service account names).
+// These are not passed by the Makefile, so the values below are the ones used.
+param logsNamespace = 'logs'
+param logsMSI = 'logs-msi'
+param logsServiceAccount = 'logs-service-account'
+param adminApiNamespace = 'admin-api'
+param adminApiServiceAccountName = 'admin-api-service-account'
+
diff --git a/dev-infrastructure/configurations/sre-tooling-infra.bicepparam b/dev-infrastructure/configurations/sre-tooling-infra.bicepparam
new file mode 100644
index 0000000000..67dfa266ed
--- /dev/null
+++ b/dev-infrastructure/configurations/sre-tooling-infra.bicepparam
@@ -0,0 +1,13 @@
+using '../templates/sre-tooling-infra.bicep'
+
+// Placeholders: dev-infrastructure/Makefile.sre-tooling passes these five
+// parameters on the command line.
+param serviceKeyVaultName = ''
+param serviceKeyVaultResourceGroup = ''
+param serviceKeyVaultTagValue = 'dev'
+param globalMSIId = ''
+param kvCertOfficerPrincipalId = ''
+
+// Fixed service key vault settings (not passed by the Makefile).
+param serviceKeyVaultLocation = 'westus3'
+param serviceKeyVaultSoftDelete = true
+param serviceKeyVaultPrivate = true
+param serviceKeyVaultTagName = 'aro-hcp-environment'
+
diff --git a/dev-infrastructure/templates/output-sre-tooling-cluster.bicep b/dev-infrastructure/templates/output-sre-tooling-cluster.bicep
new file mode 100644
index 0000000000..142aa01ccb
--- /dev/null
+++ b/dev-infrastructure/templates/output-sre-tooling-cluster.bicep
@@ -0,0 +1,42 @@
+import { safeTake } from '../modules/common.bicep'
+
+// Looks up resources that already exist in the resource group so that their
+// properties can be exposed as deployment outputs.
+
+@description('Azure Region Location')
+param location string = resourceGroup().location
+
+@description('AKS cluster name')
+param aksClusterName string
+
+@description('The managed identity name of the logs')
+param logsMSI string
+
+@description('The name of the Admin API managed identity')
+param adminApiMIName string
+
+// The data collection endpoint and the data collection rule are looked up
+// under the same generated name.
+var msPromResourceName = safeTake('MSProm-${location}-${aksClusterName}', 44)
+
+resource dce 'Microsoft.Insights/dataCollectionEndpoints@2022-06-01' existing = {
+  name: msPromResourceName
+}
+
+resource dcr 'Microsoft.Insights/dataCollectionRules@2022-06-01' existing = {
+  name: msPromResourceName
+}
+
+resource prometheusUAMI 'Microsoft.ManagedIdentity/userAssignedIdentities@2024-11-30' existing = {
+  name: 'prometheus'
+}
+
+resource logsUAMI 'Microsoft.ManagedIdentity/userAssignedIdentities@2024-11-30' existing = {
+  name: logsMSI
+}
+
+resource adminApiUAMI 'Microsoft.ManagedIdentity/userAssignedIdentities@2024-11-30' existing = {
+  name: adminApiMIName
+}
+
+// Prometheus remote-write URL, assembled from the metrics ingestion endpoint of
+// the data collection endpoint and the immutable ID of the data collection rule.
+output dcrRemoteWriteUrl string = '${dce.properties.metricsIngestion.endpoint}/dataCollectionRules/${dcr.properties.immutableId}/streams/Microsoft-PrometheusMetrics/api/v1/write?api-version=2023-04-24'
+// Fixed literal: this template does not look up a second data collection rule.
+output hcpDcrRemoteWriteUrl string = 'NONE'
+output prometheusUAMIClientId string = prometheusUAMI.properties.clientId
+output clusterLogPrincipalId string = logsUAMI.properties.principalId
+output adminApiPrincipalId string = adminApiUAMI.properties.principalId
diff --git a/dev-infrastructure/templates/sre-tooling-cluster.bicep b/dev-infrastructure/templates/sre-tooling-cluster.bicep
new file mode 100644
index 0000000000..83cca73d98
--- /dev/null
+++ b/dev-infrastructure/templates/sre-tooling-cluster.bicep
@@ -0,0 +1,319 @@
+// SRE tooling cluster template. Deploys, in the current resource group:
+// the workload managed identities, a node NSG, a VNet with a node subnet, the
+// AKS cluster (via ../modules/aks-cluster-base.bicep) and the metrics data
+// collection module; it also grants the logs identity a role on the service
+// key vault in serviceKeyVaultResourceGroup.
+import {
+  csvToArray
+  getLocationAvailabilityZonesCSV
+} from '../modules/common.bicep'
+import * as mi from '../modules/managed-identities.bicep'
+
+@description('Azure Region Location')
+param location string = resourceGroup().location
+
+@description('Availability Zones to use for the infrastructure, as a CSV string. Defaults to all the zones of the location')
+param locationAvailabilityZones string = getLocationAvailabilityZonesCSV(location)
+// Parsed once; used for ipZones and as the fallback zone list of every agent pool below.
+var locationAvailabilityZoneList = csvToArray(locationAvailabilityZones)
+
+@description('AKS cluster name')
+param aksClusterName string
+
+@description('Minimum node count for system agent pool')
+param systemAgentMinCount int
+
+@description('Maximum node count for system agent pool')
+param systemAgentMaxCount int
+
+@description('VM instance type for the system nodes')
+param systemAgentVMSize string
+
+@description('Number of pools to create for system nodes')
+param systemAgentPoolCount int
+
+@description('Zones to use for the system nodes')
+param systemAgentPoolZones string
+
+@description('Zone redundant mode for the system nodes')
+param systemZoneRedundantMode string
+
+@description('Disk size for the AKS system nodes')
+param aksSystemOsDiskSizeGB int
+
+@description('Disk size for the AKS user nodes')
+param userOsDiskSizeGB int
+
+@description('Network dataplane plugin for the AKS cluster')
+param aksNetworkDataplane string
+
+@description('Network policy plugin for the AKS cluster')
+param aksNetworkPolicy string
+
+@description('Min replicas for the worker nodes')
+param userAgentMinCount int
+
+@description('Max replicas for the worker nodes')
+param userAgentMaxCount int
+
+@description('VM instance type for the worker nodes')
+param userAgentVMSize string
+
+@description('Number of pools to create for user nodes')
+param userAgentPoolCount int
+
+@description('Zones to use for the user nodes')
+param userAgentPoolZones string
+
+@description('Zone redundant mode for the user nodes')
+param userZoneRedundantMode string
+
+@description('Min replicas for the infra worker nodes')
+param infraAgentMinCount int
+
+@description('Max replicas for the infra worker nodes')
+param infraAgentMaxCount int
+
+@description('VM instance type for the infra worker nodes')
+param infraAgentVMSize string
+
+@description('Number of pools to create for infra nodes')
+param infraAgentPoolCount int
+
+@description('Zones to use for the infra nodes')
+param infraAgentPoolZones string
+
+@description('Disk size for the AKS infra nodes')
+param infraOsDiskSizeGB int
+
+@description('Zone redundant mode for the infra nodes')
+param infraZoneRedundantMode string
+
+@description('The resource ID of the SVC ACR')
+param svcAcrResourceId string
+
+@description('Name of the resource group for the AKS nodes')
+param aksNodeResourceGroupName string = '${resourceGroup().name}-aks1'
+
+@description('VNET address prefix')
+param vnetAddressPrefix string
+
+@description('Subnet address prefix')
+param subnetPrefix string
+
+@description('Specifies the address prefix of the subnet hosting the pods of the AKS cluster.')
+param podSubnetPrefix string
+
+@description('Kubernetes version to use with AKS')
+param kubernetesVersion string
+
+@description('The name of the keyvault for AKS.')
+@maxLength(24)
+param aksKeyVaultName string
+
+@description('The tag key for the AKS keyvault')
+param aksKeyVaultTagName string
+
+@description('The tag value for the AKS keyvault')
+param aksKeyVaultTagValue string
+
+@description('Manage soft delete setting for AKS etcd key-value store')
+param aksEtcdKVEnableSoftDelete bool = true
+
+@description('IPTags to be set on the cluster outbound IP address in the format of ipTagType:tag,ipTagType:tag')
+param aksClusterOutboundIPAddressIPTags string = ''
+
+@description('The resourcegroup for regional infrastructure')
+param regionalResourceGroup string
+
+@description('The name of the service keyvault')
+param serviceKeyVaultName string
+
+@description('The name of the resourcegroup for the service keyvault')
+param serviceKeyVaultResourceGroup string = resourceGroup().name
+
+@description('MSI that will be used to run the deploymentScript')
+param globalMSIId string
+
+@description('The Azure Resource ID of the Azure Monitor Workspace (stores prometheus metrics)')
+param azureMonitoringWorkspaceId string
+
+// logs
+@description('The namespace of the logs')
+param logsNamespace string
+
+@description('The managed identity name of the logs')
+param logsMSI string
+
+@description('The service account name of the logs managed identity')
+param logsServiceAccount string
+
+@description('The name of the Admin API managed identity')
+param adminApiMIName string
+
+@description('The namespace of the Admin API managed identity')
+param adminApiNamespace string
+
+@description('The service account name of the Admin API managed identity')
+param adminApiServiceAccountName string
+
+//
+// M A N A G E D   I D E N T I T I E S
+//
+
+// One entry per workload identity: managed identity name plus the namespace and
+// service account it belongs to. items() yields key/value entries, which is
+// why the loop below reads wi.value.uamiName.
+var workloadIdentities = items({
+  logs_wi: {
+    uamiName: logsMSI
+    namespace: logsNamespace
+    serviceAccountName: logsServiceAccount
+  }
+  prom_wi: {
+    uamiName: 'prometheus'
+    namespace: 'prometheus'
+    serviceAccountName: 'prometheus'
+  }
+  admin_api_wi: {
+    uamiName: adminApiMIName
+    namespace: adminApiNamespace
+    serviceAccountName: adminApiServiceAccountName
+  }
+})
+
+module managedIdentities '../modules/managed-identities.bicep' = {
+  name: 'managed-identities'
+  params: {
+    location: location
+    manageIdentityNames: [for wi in workloadIdentities: wi.value.uamiName]
+  }
+}
+
+//
+// A K S
+//
+
+// NSG attached to the node subnet; declared without any custom security rules.
+resource sreToolingClusterNSG 'Microsoft.Network/networkSecurityGroups@2023-11-01' = {
+  location: location
+  name: 'sre-tooling-cluster-node-nsg'
+  properties: {
+    securityRules: []
+  }
+}
+
+var vnetName = 'aks-net'
+var nodeSubnetName = 'ClusterSubnet-001'
+
+module vnetCreation '../modules/network/vnet.bicep' = {
+  name: 'vnet-${vnetName}-creation'
+  params: {
+    location: location
+    vnetName: vnetName
+    vnetAddressPrefix: vnetAddressPrefix
+    enableSwift: false
+    deploymentMsiId: globalMSIId
+  }
+}
+
+module nodeSubnetCreation '../modules/network/aks-node-subnet.bicep' = {
+  name: 'subnet-${nodeSubnetName}-creation'
+  params: {
+    vnetName: vnetName
+    subnetName: nodeSubnetName
+    subnetNSGId: sreToolingClusterNSG.id
+    subnetPrefix: subnetPrefix
+  }
+  // Explicit dependency: the module only receives the VNet name (a plain
+  // variable), so nothing else orders it after the VNet deployment.
+  dependsOn: [
+    vnetCreation
+  ]
+}
+
+module sreToolingCluster '../modules/aks-cluster-base.bicep' = {
+  name: 'cluster-${uniqueString(resourceGroup().name)}'
+  scope: resourceGroup()
+  params: {
+    location: location
+    ipResourceGroup: regionalResourceGroup
+    ipZones: locationAvailabilityZoneList
+    aksClusterName: aksClusterName
+    aksNodeResourceGroupName: aksNodeResourceGroupName
+    aksEtcdKVEnableSoftDelete: aksEtcdKVEnableSoftDelete
+    aksClusterOutboundIPAddressIPTags: aksClusterOutboundIPAddressIPTags
+    kubernetesVersion: kubernetesVersion
+    vnetName: vnetName
+    nodeSubnetId: nodeSubnetCreation.outputs.subnetId
+    podSubnetPrefix: podSubnetPrefix
+    clusterType: 'sre-tooling-cluster'
+    userOsDiskSizeGB: userOsDiskSizeGB
+    userAgentMinCount: userAgentMinCount
+    userAgentMaxCount: userAgentMaxCount
+    userAgentVMSize: userAgentVMSize
+    userAgentPoolCount: userAgentPoolCount
+    // For each pool: use the pool's own zone CSV when it yields at least one
+    // entry, otherwise fall back to the zones of the location.
+    userAgentPoolZones: length(csvToArray(userAgentPoolZones)) > 0
+      ? csvToArray(userAgentPoolZones)
+      : locationAvailabilityZoneList
+    userZoneRedundantMode: userZoneRedundantMode
+    infraAgentMinCount: infraAgentMinCount
+    infraAgentMaxCount: infraAgentMaxCount
+    infraAgentVMSize: infraAgentVMSize
+    infraAgentPoolCount: infraAgentPoolCount
+    infraAgentPoolZones: length(csvToArray(infraAgentPoolZones)) > 0
+      ? csvToArray(infraAgentPoolZones)
+      : locationAvailabilityZoneList
+    infraOsDiskSizeGB: infraOsDiskSizeGB
+    infraZoneRedundantMode: infraZoneRedundantMode
+    systemOsDiskSizeGB: aksSystemOsDiskSizeGB
+    systemAgentMinCount: systemAgentMinCount
+    systemAgentMaxCount: systemAgentMaxCount
+    systemAgentVMSize: systemAgentVMSize
+    systemAgentPoolCount: systemAgentPoolCount
+    systemAgentPoolZones: length(csvToArray(systemAgentPoolZones)) > 0
+      ? csvToArray(systemAgentPoolZones)
+      : locationAvailabilityZoneList
+    systemZoneRedundantMode: systemZoneRedundantMode
+    networkDataplane: aksNetworkDataplane
+    networkPolicy: aksNetworkPolicy
+    workloadIdentities: workloadIdentities
+    aksKeyVaultName: aksKeyVaultName
+    aksKeyVaultTagName: aksKeyVaultTagName
+    aksKeyVaultTagValue: aksKeyVaultTagValue
+    pullAcrResourceIds: [svcAcrResourceId]
+    deploymentMsiId: globalMSIId
+    enableSwiftV2Nodepools: false
+    deployIstio: false
+  }
+  // workloadIdentities is a plain variable, so the ordering after the managed
+  // identities module has to be stated explicitly.
+  // NOTE(review): presumably the cluster module resolves the identities by name — confirm.
+  dependsOn: [
+    managedIdentities
+  ]
+}
+
+output aksClusterName string = sreToolingCluster.outputs.aksClusterName
+
+//
+// L O G S
+//
+// (this section declares no resources)
+
+//
+// M E T R I C S
+//
+
+module dataCollection '../modules/metrics/datacollection.bicep' = {
+  name: 'metrics-infra'
+  params: {
+    azureMonitorWorkspaceLocation: location
+    azureMonitoringWorkspaceId: azureMonitoringWorkspaceId
+    aksClusterName: aksClusterName
+    prometheusPrincipalId: mi.getManagedIdentityByName(managedIdentities.outputs.managedIdentities, 'prometheus').uamiPrincipalID
+  }
+  dependsOn: [
+    sreToolingCluster
+  ]
+}
+
+//
+// K E Y V A U L T S
+//
+
+// Deployed into the resource group of the service key vault: assigns the
+// 'Key Vault Certificate User' role on that vault to the logs identity.
+module logsServiceKeyVaultAccess '../modules/keyvault/keyvault-secret-access.bicep' = {
+  name: guid(serviceKeyVaultName, logsMSI, 'certuser')
+  scope: resourceGroup(serviceKeyVaultResourceGroup)
+  params: {
+    keyVaultName: serviceKeyVaultName
+    roleName: 'Key Vault Certificate User'
+    managedIdentityPrincipalIds: [
+      mi.getManagedIdentityByName(managedIdentities.outputs.managedIdentities, logsMSI).uamiPrincipalID
+    ]
+  }
+}
diff --git a/dev-infrastructure/templates/sre-tooling-infra-lookup.bicep b/dev-infrastructure/templates/sre-tooling-infra-lookup.bicep
new file mode 100644
index 0000000000..c2c3fca2cd
--- /dev/null
+++ b/dev-infrastructure/templates/sre-tooling-infra-lookup.bicep
@@ -0,0 +1,18 @@
+// Runs the key vault lookup module in the resource group of the service key
+// vault and re-exports the vault name and URL it returns.
+@description('The name of the service keyvault')
+param serviceKeyVaultName string
+
+@description('The name of the resource group for the service keyvault')
+param serviceKeyVaultResourceGroup string = resourceGroup().name
+
+var deploymentNameSuffix = uniqueString(resourceGroup().id)
+
+module serviceKeyVault '../modules/keyvault/lookup.bicep' = {
+  name: 'sre-tooling-kv-${deploymentNameSuffix}'
+  scope: resourceGroup(serviceKeyVaultResourceGroup)
+  params: {
+    keyVaultName: serviceKeyVaultName
+  }
+}
+
+output sreToolingKeyVaultName string = serviceKeyVault.outputs.keyVaultName
+output sreToolingKeyVaultUrl string = serviceKeyVault.outputs.keyVaultUrl
diff --git a/dev-infrastructure/templates/sre-tooling-infra.bicep b/dev-infrastructure/templates/sre-tooling-infra.bicep
new file mode 100644
index 0000000000..ac6a64bc17
--- /dev/null
+++ b/dev-infrastructure/templates/sre-tooling-infra.bicep
@@ -0,0 +1,103 @@
+@description('The name of the service keyvault')
+param serviceKeyVaultName string
+
+@description('The name of the resource group for the service keyvault')
+param serviceKeyVaultResourceGroup string = resourceGroup().name
+
+@description('The location of the resource group for the service keyvault')
+param serviceKeyVaultLocation string = resourceGroup().location
+
+@description('Soft delete setting for service keyvault')
+param serviceKeyVaultSoftDelete bool = true
+
+@description('If true, make the service keyvault private and only accessible by the svc cluster via private link.')
+param serviceKeyVaultPrivate bool = true
+
+// KV tagging
+param serviceKeyVaultTagName string
+param serviceKeyVaultTagValue string
+
+@description('KV certificate officer principal ID')
+param kvCertOfficerPrincipalId string
+
+@description('MSI that will be used during pipeline runs')
+param globalMSIId string
+
+// Reader role
+// https://www.azadvertizer.net/azrolesadvertizer/acdd72a7-3385-48ef-bd42-f606fba81ae7.html
+var readerRoleId = subscriptionResourceId(
+  'Microsoft.Authorization/roleDefinitions',
+  'acdd72a7-3385-48ef-bd42-f606fba81ae7'
+)
+
+// service deployments running as the aroDevopsMsi need to lookup metadata about
+// all kinds
+// of resources, e.g. AKS metadata, database metadata, MI metadata, etc.
+resource aroDevopsMSIReader 'Microsoft.Authorization/roleAssignments@2022-04-01' = {
+  name: guid(resourceGroup().id, globalMSIId, readerRoleId)
+  properties: {
+    principalId: reference(globalMSIId, '2023-01-31').principalId
+    principalType: 'ServicePrincipal'
+    roleDefinitionId: readerRoleId
+  }
+}
+
+//
+// K E Y V A U L T S
+//
+
+var deploymentNameSuffix = uniqueString(resourceGroup().id)
+
+module serviceKeyVault '../modules/keyvault/keyvault.bicep' = {
+  name: 'svc-kv-${deploymentNameSuffix}'
+  scope: resourceGroup(serviceKeyVaultResourceGroup)
+  params: {
+    location: serviceKeyVaultLocation
+    keyVaultName: serviceKeyVaultName
+    private: serviceKeyVaultPrivate
+    enableSoftDelete: serviceKeyVaultSoftDelete
+    tagKey: serviceKeyVaultTagName
+    tagValue: serviceKeyVaultTagValue
+  }
+}
+
+// The three role assignments below refer to the vault by name only, hence the
+// explicit dependsOn on the serviceKeyVault module.
+module serviceKeyVaultCertOfficer '../modules/keyvault/keyvault-secret-access.bicep' = {
+  name: 'svc-kv-cert-officer-${deploymentNameSuffix}'
+  scope: resourceGroup(serviceKeyVaultResourceGroup)
+  params: {
+    keyVaultName: serviceKeyVaultName
+    roleName: 'Key Vault Certificates Officer'
+    managedIdentityPrincipalIds: [kvCertOfficerPrincipalId]
+  }
+  dependsOn: [
+    serviceKeyVault
+  ]
+}
+
+module serviceKeyVaultSecretsOfficer '../modules/keyvault/keyvault-secret-access.bicep' = {
+  name: 'svc-kv-secret-officer-${deploymentNameSuffix}'
+  scope: resourceGroup(serviceKeyVaultResourceGroup)
+  params: {
+    keyVaultName: serviceKeyVaultName
+    roleName: 'Key Vault Secrets Officer'
+    managedIdentityPrincipalIds: [kvCertOfficerPrincipalId]
+  }
+  dependsOn: [
+    serviceKeyVault
+  ]
+}
+
+module serviceKeyVaultDevopsSecretsOfficer '../modules/keyvault/keyvault-secret-access.bicep' = {
+  name: 'svc-kv-devops-secret-officer-${deploymentNameSuffix}'
+  scope: resourceGroup(serviceKeyVaultResourceGroup)
+  params: {
+    keyVaultName: serviceKeyVaultName
+    roleName: 'Key Vault Secrets Officer'
+    managedIdentityPrincipalIds: [reference(globalMSIId, '2023-01-31').principalId]
+  }
+  dependsOn: [
+    serviceKeyVault
+  ]
+}
+
+output svcKeyVaultName string = serviceKeyVault.outputs.kvName
+output svcKeyVaultUrl string = serviceKeyVault.outputs.kvUrl
diff --git a/dev-infrastructure/templates/sre-tooling-mgmt-permissions.bicep b/dev-infrastructure/templates/sre-tooling-mgmt-permissions.bicep
new file mode 100644
index 0000000000..e5985d6ba7
--- /dev/null
+++ b/dev-infrastructure/templates/sre-tooling-mgmt-permissions.bicep
@@ -0,0 +1,77 @@
+@description('The name of the CX KeyVault')
+param cxKeyVaultName string
+
+@description('The name of the MSI KeyVault')
+param msiKeyVaultName string
+
+@description('MSI credentials refresher MI resource ID, used to grant KeyVault access')
+param msiRefresherMIResourceId string
+
+@description('CS MI resource ID, used to grant KeyVault access')
+param clusterServiceMIResourceId string
+
+@description('Admin API MI resource ID, used to grant resource group introspection access')
+param adminApiMIResourceId string
+
+resource cxKeyVault 'Microsoft.KeyVault/vaults@2024-04-01-preview' existing = {
+  name: cxKeyVaultName
+}
+
+resource msiKeyVault 'Microsoft.KeyVault/vaults@2024-04-01-preview' existing = {
+  name: msiKeyVaultName
+}
+
+//
+// C L U S T E R   S E R V I C E   K V   A C C E S S
+//
+
+import * as res from '../modules/resource.bicep'
+
+// Each access module is deployed only when the corresponding parameter passes
+// res.isMsiResourceId().
+module csKeyVaultAccess '../modules/mgmt-kv-access.bicep' = if (res.isMsiResourceId(clusterServiceMIResourceId)) {
+  name: 'cs-msi-kv-access'
+  params: {
+    managedIdentityResourceIds: [clusterServiceMIResourceId]
+    cxKeyVaultName: cxKeyVault.name
+    msiKeyVaultName: msiKeyVault.name
+  }
+}
+
+//
+// M S I   C R E D E N T I A L S   R E F R E S H E R   K V   A C C E S S
+//
+
+module msiRefresherKeyVaultAccess '../modules/mgmt-kv-access.bicep' = if (res.isMsiResourceId(msiRefresherMIResourceId)) {
+  name: 'msi-refresher-msi-kv-access'
+  params: {
+    managedIdentityResourceIds: [msiRefresherMIResourceId]
+    // NOTE(review): empty CX vault name, presumably so that only the MSI vault is granted — confirm against mgmt-kv-access.bicep.
+    cxKeyVaultName: ''
+    msiKeyVaultName: msiKeyVault.name
+  }
+}
+
+//
+// A D M I N   A P I   R E S O U R C E   G R O U P   I N T R O S P E C T I O N   A C C E S S
+//
+
+// Reader role
+// https://www.azadvertizer.net/azrolesadvertizer/acdd72a7-3385-48ef-bd42-f606fba81ae7.html
+var readerRoleId = subscriptionResourceId(
+  'Microsoft.Authorization/roleDefinitions',
+  'acdd72a7-3385-48ef-bd42-f606fba81ae7'
+)
+
+var adminApiMIRef = res.msiRefFromId(adminApiMIResourceId)
+resource adminApiMSI 'Microsoft.ManagedIdentity/userAssignedIdentities@2023-01-31' existing = {
+  scope: resourceGroup(adminApiMIRef.resourceGroup.subscriptionId, adminApiMIRef.resourceGroup.name)
+  name: adminApiMIRef.name
+}
+
+// The assignment name is derived from a fixed GUID-shaped literal rather than
+// from readerRoleId; changing it would change the name of the role assignment.
+resource resourceGroupReaderRoleAssignment 'Microsoft.Authorization/roleAssignments@2022-04-01' = if (res.isMsiResourceId(adminApiMIResourceId)) {
+  scope: resourceGroup()
+  name: guid(resourceGroup().id, adminApiMIResourceId, '00000000-0000-0000-0000-000000000001')
+  properties: {
+    roleDefinitionId: readerRoleId
+    principalId: adminApiMSI.properties.principalId
+    principalType: 'ServicePrincipal'
+  }
+}
diff --git a/observability/prometheus/Makefile.sre-tooling b/observability/prometheus/Makefile.sre-tooling
new file mode 100644
index 0000000000..f22e3eadf7
--- /dev/null
+++ b/observability/prometheus/Makefile.sre-tooling
@@ -0,0 +1,60 @@
+# Makefile for deploying Prometheus to SRE Tooling cluster
+# This is a separate Makefile to avoid triggering main CI/CD tests
+# Usage: make -f Makefile.sre-tooling deploy
+
+SRE_TOOLING_RG ?= hcp-underlay-pers-westus3-sre-tooling
+SRE_TOOLING_SUBSCRIPTION_ID ?= 1d3378d3-5a3f-4712-85a1-2485495dfc4b
+AKS_CLUSTER_NAME ?= pers-westus3-sre-tooling
+# NOTE(review): KUBECONFIG is often exported in a user's shell. Because of `?=`
+# an exported value wins, and get-kubeconfig then writes into that file.
+KUBECONFIG ?= /tmp/sre-tooling-kubeconfig
+# File used to hand the values discovered by get-values over to deploy.
+VALUES_ENV_FILE ?= /tmp/sre-tooling-prometheus-values.env
+
+.PHONY: deploy get-values check get-kubeconfig create-namespace
+
+# Fail early when one of the settings above was overridden with an empty value.
+check:
+	@[ -n "${SRE_TOOLING_RG}" ] || ( echo "ERROR: SRE_TOOLING_RG is not set"; exit 1 )
+	@[ -n "${SRE_TOOLING_SUBSCRIPTION_ID}" ] || ( echo "ERROR: SRE_TOOLING_SUBSCRIPTION_ID is not set"; exit 1 )
+	@[ -n "${AKS_CLUSTER_NAME}" ] || ( echo "ERROR: AKS_CLUSTER_NAME is not set"; exit 1 )
+
+# Resolve the DCE metrics endpoint, the DCR immutable ID and the client ID of
+# the "prometheus" identity, and store them in VALUES_ENV_FILE.
+# `set -e` aborts on the first failing az call, and all three values are
+# checked, so deploy never runs with an empty remote-write URL or identity.
+# As before, the first DCE/DCR listed in the resource group is used.
+get-values: check
+	@echo "=== Getting cluster values ==="
+	@set -e; \
+	DCE_NAME=$$(az monitor data-collection endpoint list --resource-group "${SRE_TOOLING_RG}" --subscription "${SRE_TOOLING_SUBSCRIPTION_ID}" --query "[0].name" -o tsv); \
+	if [ -z "$$DCE_NAME" ]; then \
+		echo "ERROR: Could not find DCE. Make sure cluster is deployed."; \
+		exit 1; \
+	fi; \
+	DCE_ENDPOINT=$$(az monitor data-collection endpoint show --name "$$DCE_NAME" --resource-group "${SRE_TOOLING_RG}" --subscription "${SRE_TOOLING_SUBSCRIPTION_ID}" --query "properties.metricsIngestion.endpoint" -o tsv); \
+	DCR_NAME=$$(az monitor data-collection rule list --resource-group "${SRE_TOOLING_RG}" --subscription "${SRE_TOOLING_SUBSCRIPTION_ID}" --query "[0].name" -o tsv); \
+	if [ -z "$$DCR_NAME" ]; then \
+		echo "ERROR: Could not find DCR. Make sure cluster is deployed."; \
+		exit 1; \
+	fi; \
+	DCR_IMMUTABLE_ID=$$(az monitor data-collection rule show --name "$$DCR_NAME" --resource-group "${SRE_TOOLING_RG}" --subscription "${SRE_TOOLING_SUBSCRIPTION_ID}" --query "properties.immutableId" -o tsv); \
+	PROMETHEUS_MSI=$$(az identity show --name prometheus --resource-group "${SRE_TOOLING_RG}" --subscription "${SRE_TOOLING_SUBSCRIPTION_ID}" --query "clientId" -o tsv); \
+	if [ -z "$$DCE_ENDPOINT" ] || [ -z "$$DCR_IMMUTABLE_ID" ] || [ -z "$$PROMETHEUS_MSI" ]; then \
+		echo "ERROR: Could not resolve DCE endpoint, DCR immutable ID or prometheus identity client ID."; \
+		exit 1; \
+	fi; \
+	{ \
+		printf "DCE_ENDPOINT='%s'\n" "$$DCE_ENDPOINT"; \
+		printf "DCR_IMMUTABLE_ID='%s'\n" "$$DCR_IMMUTABLE_ID"; \
+		printf "PROMETHEUS_MSI='%s'\n" "$$PROMETHEUS_MSI"; \
+	} > "${VALUES_ENV_FILE}"; \
+	echo "Values saved to ${VALUES_ENV_FILE}"
+
+# Fetch cluster credentials into KUBECONFIG. Normal output is suppressed, but
+# errors of `az aks get-credentials` stay visible (--only-show-errors).
+# The kubelogin conversion remains best-effort, as before.
+get-kubeconfig: check
+	@echo "=== Getting kubeconfig ==="
+	@az aks get-credentials --name "${AKS_CLUSTER_NAME}" --resource-group "${SRE_TOOLING_RG}" --subscription "${SRE_TOOLING_SUBSCRIPTION_ID}" --overwrite-existing --file "${KUBECONFIG}" --only-show-errors >/dev/null
+	@kubelogin convert-kubeconfig -l azurecli --kubeconfig "${KUBECONFIG}" >/dev/null 2>&1 || true
+	@echo "Kubeconfig saved to ${KUBECONFIG}"
+
+create-namespace: get-kubeconfig
+	@echo "=== Creating namespace ==="
+	@kubectl --kubeconfig "${KUBECONFIG}" apply -f namespace.sre-tooling.yaml
+
+# Install or upgrade the chart in ./deploy using the values gathered by
+# get-values; the remote-write URL is assembled from the DCE endpoint and the
+# DCR immutable ID.
+deploy: get-values create-namespace
+	@echo "=== Deploying Prometheus ==="
+	@set -e; \
+	. "${VALUES_ENV_FILE}"; \
+	DCR_URL="$$DCE_ENDPOINT/dataCollectionRules/$$DCR_IMMUTABLE_ID/streams/Microsoft-PrometheusMetrics/api/v1/write?api-version=2023-04-24"; \
+	helm upgrade --install arohcp-monitor deploy/ \
+		--kubeconfig "${KUBECONFIG}" \
+		--namespace prometheus \
+		--values values-sre-tooling-direct.yaml \
+		--set prometheusSpec.externalLabels.cluster="${AKS_CLUSTER_NAME}" \
+		--set prometheusSpec.remoteWriteUrl="$$DCR_URL" \
+		--set prometheus.serviceAccount.managedIdentity="$$PROMETHEUS_MSI" \
+		--wait --timeout 10m
+	@echo "=== Prometheus deployment complete ==="
+
diff --git a/observability/prometheus/namespace.sre-tooling.yaml b/observability/prometheus/namespace.sre-tooling.yaml
new file mode 100644
index 0000000000..6e457d3a0b
--- /dev/null
+++ b/observability/prometheus/namespace.sre-tooling.yaml
@@ -0,0 +1,6 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: "prometheus"
+  labels:
+    "network.openshift.io/policy-group": ""
diff --git a/observability/prometheus/values-sre-tooling-direct.yaml b/observability/prometheus/values-sre-tooling-direct.yaml new file mode 100644 index 0000000000..4ff7fd86aa --- /dev/null +++ b/observability/prometheus/values-sre-tooling-direct.yaml @@ -0,0 +1,95 @@ +kube-prometheus-stack: + fullnameOverride: "prometheus" + crds: + enabled: true + global: + rbac: + create: true + alertmanager: + enabled: false + grafana: + enabled: false + # Azure Managed Prometheus handles these + coreDns: + enabled: false + kubeDns: + enabled: false + nodeExporter: + enabled: false + kubeEtcd: + enabled: false + kubeScheduler: + enabled: false + kubeControllerManager: + enabled: false + kubeProxy: + enabled: false + kubelet: + enabled: false + kubeApiServer: + enabled: false 
+ kubeStateMetrics: + enabled: true + prometheus: + enabled: false # Prometheus is deployed using templates/prometheus.yaml + prometheusOperator: + enabled: true + fullnameOverride: "" + image: + registry: mcr.microsoft.com/oss/v2 + repository: prometheus/prometheus-operator + sha: 5c3fd99a70fb43aa5ea4672ccaf04c68ac60d7adb7e205d2bf75d1b46395455c # v0.87.0-1 + prometheusConfigReloader: + image: + registry: mcr.microsoft.com/oss/v2 + repository: prometheus/prometheus-config-reloader + sha: 807993bae8544ca5dba1a088507c39adce29fef1afd2fb7492247b09e0cfa2a9 # v0.87.0 + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: aro-hcp.azure.com/role + operator: In + values: + - "infra" + tolerations: + - key: "infra" + operator: "Equal" + value: "true" + effect: "NoSchedule" + cleanPrometheusOperatorObjectNames: true + kube-state-metrics: + image: + registry: mcr.microsoft.com/oss/v2 + repository: kubernetes/kube-state-metrics + digest: sha256:377bd55d97824dfee27b3c577869bc793fc3970623433627c7f0ee1b1b74e725 # v2.17.0 +# Prometheus spec configuration +prometheusSpec: + image: + registry: mcr.microsoft.com/oss/v2 + repository: prometheus/prometheus + sha: 70296b2f2cc69c88d070fe8b627d347bd070e084c14babbc93e91583e03eca06 # v3.8.0-1 + version: "" + externalLabels: + cluster: "" # Will be set from cluster name + remoteWriteUrl: "" # Will be set from cluster outputs + hcpRemoteWriteUrl: "NONE" + zoneCount: 3 + maximumStartupDurationSeconds: 360 + retention: 6h + retentionSize: 45GiB + resources: + requests: + cpu: 1 + memory: 2Gi + limits: + cpu: 2 + memory: "6Gi" +prometheus: + prometheusSpec: + shards: 1 + replicas: 1 + serviceAccount: + managedIdentity: "" # Will be set from cluster outputs +environment: "" # Not used for sre-tooling