Skip to content

Commit 3d00723

Browse files
authored
Resolve deployment failure that is caused by insufficient memory. (#314)
* resolve issue that is caused by insufficient memory. * fail fast for insufficient memory. * calculate the memory according to replicas. * exclude B-series VM sizes. * enhance the UI. * increase pom * AKS restricted skus: Standard_A1_v2. * https://aka.ms/aks/restricted-skus * make sure `aksNodeMaxCount` is greaterOrEquals `aksNodeCount`. * for testing * use bc for the floating-point calculation. * increase pom * fix typos * for small VM size, may need more time to provision WLS pods. * exclude the vm size that WLS does not work well in. * add warning message. * fix corner case * tune livenessProbe and readinessProbe for A-series VM sizes. * tune ReadinessProbe * support Standard_A2_v2. * do not validate application if no app is deployed.
1 parent 6bd2328 commit 3d00723

File tree

7 files changed

+137
-32
lines changed

7 files changed

+137
-32
lines changed

pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
<properties>
4141
<!-- versions start -->
4242
<!-- weblogic azure aks versions -->
43-
<version.wls-on-aks-azure-marketplace>1.0.76</version.wls-on-aks-azure-marketplace>
43+
<version.wls-on-aks-azure-marketplace>1.0.77</version.wls-on-aks-azure-marketplace>
4444
<!-- weblogic azure vm versions -->
4545
<version.arm-oraclelinux-wls>1.0.27</version.arm-oraclelinux-wls>
4646
<version.arm-oraclelinux-wls-admin>1.0.51</version.arm-oraclelinux-wls-admin>

weblogic-azure-aks/src/main/arm/createUiDefinition.json

Lines changed: 86 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,14 @@
1818
"type": "Microsoft.Common.Section",
1919
"label": "Credentials for WebLogic",
2020
"elements": [
21+
{
22+
"name": "listVMSizes",
23+
"type": "Microsoft.Solutions.ArmApiControl",
24+
"request": {
25+
"method": "GET",
26+
"path": "[concat(subscription().id, '/providers/Microsoft.Compute/locations/',location(),'/vmSizes?api-version=2024-03-01')]"
27+
}
28+
},
2129
{
2230
"name": "wlsUserName",
2331
"type": "Microsoft.Common.TextBox",
@@ -311,14 +319,81 @@
311319
"uri": "https://aka.ms/wls-aks-well-tested-version"
312320
}
313321
}
314-
},
322+
},
323+
{
324+
"name": "nodeVMSizeSelector",
325+
"type": "Microsoft.Compute.SizeSelector",
326+
"label": "Node size",
327+
"toolTip": "The size of virtual machine to provision.",
328+
"recommendedSizes": [
329+
"Standard_DS2_v2"
330+
],
331+
"constraints": {
332+
"numAvailabilityZonesRequired": 3,
333+
"zone": "3",
334+
"excludedSizes": [
335+
"Standard_A0",
336+
"Standard_A1",
337+
"Standard_A1_v2",
338+
"Standard_F1",
339+
"Standard_F1s",
340+
"Standard_B12ms",
341+
"Standard_B16als_v2",
342+
"Standard_B16as_v2",
343+
"Standard_B16ls_v2",
344+
"Standard_B16ms",
345+
"Standard_B16pls_v2",
346+
"Standard_B16ps_v2",
347+
"Standard_B16s_v2",
348+
"Standard_B1ls",
349+
"Standard_B1ms",
350+
"Standard_B1s",
351+
"Standard_B20ms",
352+
"Standard_B2als_v2",
353+
"Standard_B2as_v2",
354+
"Standard_B2ats_v2",
355+
"Standard_B2ls_v2",
356+
"Standard_B2ms",
357+
"Standard_B2pls_v2",
358+
"Standard_B2ps_v2",
359+
"Standard_B2pts_v2",
360+
"Standard_B2s",
361+
"Standard_B2s_v2",
362+
"Standard_B2ts_v2",
363+
"Standard_B32als_v2",
364+
"Standard_B32as_v2",
365+
"Standard_B32ls_v2",
366+
"Standard_B32s_v2",
367+
"Standard_B4als_v2",
368+
"Standard_B4as_v2",
369+
"Standard_B4ls_v2",
370+
"Standard_B4ms",
371+
"Standard_B4pls_v2",
372+
"Standard_B4ps_v2",
373+
"Standard_B4s_v2",
374+
"Standard_B8als_v2",
375+
"Standard_B8as_v2",
376+
"Standard_B8ls_v2",
377+
"Standard_B8ms",
378+
"Standard_B8pls_v2",
379+
"Standard_B8ps_v2",
380+
"Standard_B8s_v2"
381+
]
382+
},
383+
"options": {
384+
"hideDiskTypeFilter": false
385+
},
386+
"osPlatform": "Linux",
387+
"count": "[steps('section_aks').clusterInfo.aksNodeCount]",
388+
"visible": "[bool(steps('section_aks').clusterInfo.createAKSCluster)]"
389+
},
315390
{
316391
"name": "aksNodeCount",
317392
"type": "Microsoft.Common.Slider",
318-
"min": 1,
319-
"max": 100,
320-
"label": "Minimum node count",
393+
"min": "[add(1, div(add(12288, mul(if(empty(basics('basicsOptional').wlsClusterSize),5,basics('basicsOptional').wlsClusterSize), 1536)), first(filter(basics('basicsRequired').listVMSizes.value, (item) => equals(item.name, steps('section_aks').clusterInfo.nodeVMSizeSelector))).memoryInMB))]",
321394
"defaultValue": 3,
395+
"max": 998,
396+
"label": "Minimum node count",
322397
"showStepMarkers": false,
323398
"toolTip": "Set the minimum node count for the cluster.",
324399
"constraints": {
@@ -329,35 +404,16 @@
329404
{
330405
"name": "aksNodeMaxCount",
331406
"type": "Microsoft.Common.Slider",
332-
"min": "[int(steps('section_aks').clusterInfo.aksNodeCount)]",
333-
"max": 100,
407+
"min": "[add(steps('section_aks').clusterInfo.aksNodeCount,2)]",
408+
"defaultValue": 3,
409+
"max": 1000,
334410
"label": "Maximum node count",
335-
"defaultValue": 5,
336411
"showStepMarkers": false,
337412
"toolTip": "Set the maximum node count for the cluster.",
338413
"constraints": {
339414
"required": true
340415
},
341416
"visible": "[bool(steps('section_aks').clusterInfo.createAKSCluster)]"
342-
},
343-
{
344-
"name": "nodeVMSizeSelector",
345-
"type": "Microsoft.Compute.SizeSelector",
346-
"label": "Node size",
347-
"toolTip": "The size of virtual machine to provision.",
348-
"recommendedSizes": [
349-
"Standard_DS2_v2"
350-
],
351-
"constraints": {
352-
"numAvailabilityZonesRequired": 3,
353-
"zone": "3"
354-
},
355-
"options": {
356-
"hideDiskTypeFilter": false
357-
},
358-
"osPlatform": "Linux",
359-
"count": "[steps('section_aks').clusterInfo.aksNodeCount]",
360-
"visible": "[bool(steps('section_aks').clusterInfo.createAKSCluster)]"
361417
}
362418
]
363419
},
@@ -2024,7 +2080,8 @@
20242080
}
20252081
]
20262082
},
2027-
"visible": "[and(bool(steps('section_database').enableDB), not(and(steps('section_database').databaseConnectionInfo.enablePswlessConnection0, equals(steps('section_database').databaseConnectionInfo.databaseType, 'sqlserver'))))]" },
2083+
"visible": "[and(bool(steps('section_database').enableDB), not(and(steps('section_database').databaseConnectionInfo.enablePswlessConnection0, equals(steps('section_database').databaseConnectionInfo.databaseType, 'sqlserver'))))]"
2084+
},
20282085
{
20292086
"name": "enablePswlessConnection",
20302087
"type": "Microsoft.Common.CheckBox",
@@ -2258,7 +2315,7 @@
22582315
"visible": "[equals(steps('section_autoScaling').autoScalingInfo.kmsMetrics, 'memory')]"
22592316
}
22602317
],
2261-
"visible": "[bool(steps('section_autoScaling').enableAutoscaling)]"
2318+
"visible": "[bool(steps('section_autoScaling').enableAutoscaling)]"
22622319
}
22632320
]
22642321
}
@@ -2363,4 +2420,4 @@
23632420
"wlsUserName": "[basics('basicsRequired').wlsUserName]"
23642421
}
23652422
}
2366-
}
2423+
}

weblogic-azure-aks/src/main/arm/scripts/common.sh

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# This script runs on Azure Container Instance with Alpine Linux that Azure Deployment script creates.
44

55
export checkPodStatusInterval=20 # interval of checking pod status.
6-
export checkPodStatusMaxAttemps=100 # max attempt to check pod status.
6+
export checkPodStatusMaxAttemps=200 # max attempt to check pod status.
77
export checkPVStateInterval=5 # interval of checking pvc status.
88
export checkPVStateMaxAttempt=10 # max attempt to check pvc status.
99
export checkSVCStateMaxAttempt=50
@@ -36,6 +36,12 @@ export constMSSQLDriverName="mssql-jdbc-10.2.1.jre8.jar"
3636
export constAzureCoreVersion="1.34.0"
3737
export constDbPodIdentitySelector="db-pod-identity" # do not change the value
3838
export constPreclassDirectoryName="preclassLibraries"
39+
export constLivenessProbePeriodSeconds=30
40+
export constLivenessProbeTimeoutSeconds=5
41+
export constLivenessProbeFailureThreshold=20
42+
export constReadinessProbeProbePeriodSeconds=10
43+
export constReadinessProbeTimeoutSeconds=5
44+
export constReadinessProbeFailureThreshold=3
3945

4046
export curlMaxTime=120 # seconds
4147
export ocrLoginServer="container-registry.oracle.com"

weblogic-azure-aks/src/main/arm/scripts/genDomainConfig.sh

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,16 @@ spec:
8282
8383
# Settings for all server pods in the domain including the introspector job pod
8484
serverPod:
85+
# Tune for small VM sizes
86+
# https://oracle.github.io/weblogic-kubernetes-operator/managing-domains/domain-lifecycle/liveness-readiness-probe-customization/
87+
livenessProbe:
88+
periodSeconds: ${constLivenessProbePeriodSeconds}
89+
timeoutSeconds: ${constLivenessProbeTimeoutSeconds}
90+
failureThreshold: ${constLivenessProbeFailureThreshold}
91+
readinessProbe:
92+
periodSeconds: ${constReadinessProbeProbePeriodSeconds}
93+
timeoutSeconds: ${constReadinessProbeTimeoutSeconds}
94+
failureThreshold: ${constReadinessProbeFailureThreshold}
8595
# Optional new or overridden environment variables for the domain's pods
8696
# - This sample uses CUSTOM_DOMAIN_NAME in its image model file
8797
# to set the Weblogic domain name

weblogic-azure-aks/src/main/arm/scripts/inline-scripts/validateParameters.sh

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,29 @@ function validate_compute_resources() {
126126
echo_stdout "Check compute resources: passed!"
127127
}
128128

129+
# Ensure the cluster has enough memory resources.
130+
# The offer deploys a WLS cluster with 1 + ${APP_REPLICAS} pods; each pod requests 1.5GB and 0.25CPU.
131+
# Minimum memory requirement: 12 + (APP_REPLICAS + 1)*1.5 GB
132+
function validate_memory_resources() {
133+
if [[ "${createAKSCluster,,}" == "true" ]]; then
134+
local requiredMemoryinGB=$(echo "12+($APP_REPLICAS+1)*1.5" | bc)
135+
136+
local vmDetails=$(az vm list-skus --size ${aksAgentPoolVMSize} -l ${location} --query [0])
137+
validate_status "Query VM details of ${aksAgentPoolVMSize} in ${location}."
138+
139+
local memoryGB=$(echo ${vmDetails} | jq '.capabilities[] | select(.name=="MemoryGB") | .value' | tr -d "\"")
140+
local requestedMemory=$(echo "$aksAgentPoolNodeCount*$memoryGB" | bc)
141+
echo_stdout "Current requested memory is ${requestedMemory}GB."
142+
if [[ $(echo "${requestedMemory}<${requiredMemoryinGB}" | bc) -eq 1 ]]; then
143+
echo_stderr "It requires ${requiredMemoryinGB} GiB memory to create the AKS cluster, you have to select a larger VM size or increase node count."
144+
exit 1
145+
fi
146+
147+
fi
148+
149+
echo_stdout "Check memory resources: passed!"
150+
}
151+
129152
function validate_ocr_account() {
130153
# install docker cli
131154
install_docker
@@ -601,6 +624,8 @@ sslCertificateKeyVaultOption="keyVaultStoredConfig"
601624

602625
validate_compute_resources
603626

627+
validate_memory_resources
628+
604629
validate_base_image_path
605630

606631
validate_acr_admin_enabled

weblogic-azure-aks/src/main/bicep/mainTemplate.bicep

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,7 @@ var const_hasTags = contains(resourceGroup(), 'tags')
324324
// * generate selfsigned certificate for gateway frontend TLS/SSL.
325325
var const_bCreateNewKeyVault = (!const_hasTags || !contains(resourceGroup().tags, name_tagNameForKeyVault) || empty(resourceGroup().tags.wlsKeyVault)) && ((enableCustomSSL && sslConfigurationAccessOption != const_wlsSSLCertOptionKeyVault) || (enableAppGWIngress && (appGatewayCertificateOption != const_appGatewaySSLCertOptionHaveKeyVault)))
326326
var const_bCreateStorageAccount = (createAKSCluster || !const_hasStorageAccount) && const_enablePV
327+
var const_bValidateApplications= validateApplications && (length(appPackageUrls) > 0)
327328
var const_createNewAcr = useOracleImage && createACR
328329
var const_defaultKeystoreType = 'PKCS12'
329330
var const_enableNetworking = (length(lbSvcValues) > 0) || enableAppGWIngress
@@ -409,6 +410,7 @@ module validateInputs 'modules/_deployment-scripts/_ds-validate-parameters.bicep
409410
appGatewayCertificateOption: appGatewayCertificateOption
410411
appGatewaySSLCertData: appGatewaySSLCertData
411412
appGatewaySSLCertPassword: appGatewaySSLCertPassword
413+
appReplicas: appReplicas
412414
azCliVersion: const_azcliVersion
413415
createAKSCluster: createAKSCluster
414416
createDNSZone: createDNSZone
@@ -820,7 +822,7 @@ module passwordlessDatasourceDeployment 'modules/_setupPasswordlessDBConnection.
820822
* To check if all the applications in WLS cluster reach the ACTIVE state after all configurations are completed.
821823
* This should be the last step.
822824
*/
823-
module validateApplciations 'modules/_deployment-scripts/_ds-validate-applications.bicep' = if (validateApplications) {
825+
module validateApplciations 'modules/_deployment-scripts/_ds-validate-applications.bicep' = if (const_bValidateApplications) {
824826
name: 'validate-wls-application-status'
825827
params: {
826828
_artifactsLocation: _artifactsLocation

weblogic-azure-aks/src/main/bicep/modules/_deployment-scripts/_ds-validate-parameters.bicep

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ param appGatewayCertificateOption string
1212
param appGatewaySSLCertData string
1313
@secure()
1414
param appGatewaySSLCertPassword string
15+
param appReplicas int
1516
param azCliVersion string = ''
1617
param createAKSCluster bool
1718
param createDNSZone bool
@@ -121,6 +122,10 @@ resource deploymentScript 'Microsoft.Resources/deploymentScripts@${azure.apiVers
121122
name: 'AKS_VERSION'
122123
value: aksVersion
123124
}
125+
{
126+
name: 'APP_REPLICAS'
127+
value: appReplicas
128+
}
124129
{
125130
name: 'WLS_SSL_KEYVAULT_NAME'
126131
value: sslKeyVaultName

0 commit comments

Comments
 (0)