Skip to content

Commit daa0969

Browse files
fix: dynamically resolve root device within disk_queue.service (#7527)
1 parent 7ca28e6 commit daa0969

15 files changed

+114
-3
lines changed

e2e/validation.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,8 @@ func ValidateCommonLinux(ctx context.Context, s *Scenario) {
5050
ValidateTLSBootstrapping(ctx, s)
5151
ValidateKubeletServingCertificateRotation(ctx, s)
5252
ValidateSystemdWatchdogForKubernetes132Plus(ctx, s)
53-
ValidateSystemdUnitIsNotFailed(ctx, s, "aks-log-collector")
53+
ValidateAKSLogCollector(ctx, s)
54+
ValidateDiskQueueService(ctx, s)
5455
ValidateLeakedSecrets(ctx, s)
5556
ValidateIPTablesCompatibleWithCiliumEBPF(ctx, s)
5657

e2e/validators.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,14 @@ func ValidateSystemdWatchdogForKubernetes132Plus(ctx context.Context, s *Scenari
180180
}
181181
}
182182

183+
func ValidateAKSLogCollector(ctx context.Context, s *Scenario) {
184+
ValidateSystemdUnitIsNotFailed(ctx, s, "aks-log-collector")
185+
}
186+
187+
func ValidateDiskQueueService(ctx context.Context, s *Scenario) {
188+
ValidateSystemdUnitIsRunning(ctx, s, "disk_queue.service")
189+
}
190+
183191
func ValidateLeakedSecrets(ctx context.Context, s *Scenario) {
184192
secrets := map[string]string{
185193
"client private key": base64.StdEncoding.EncodeToString([]byte(s.GetClientPrivateKey())),

parts/linux/cloud-init/artifacts/disk_queue.service

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ Description=Set nr_requests and queue_depth based on experimental tuning
33

44
[Service]
55
Type=oneshot
6-
ExecStart=/usr/bin/env bash -c 'echo 128 > /sys/block/sda/queue/nr_requests && echo 128 > /sys/block/sda/device/queue_depth'
6+
ExecStart=sudo /bin/bash /opt/azure/containers/disk_queue.sh
77
RemainAfterExit=true
88
StandardOutput=journal
99

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#!/bin/bash
#
# Applies queue tuning (nr_requests and queue_depth, both set to 128) to the
# block device behind the Azure root/os device link.
#
# Exit status:
#   0 - settings were applied, or the device is NVMe and tuning was skipped on purpose
#   1 - the device could not be resolved, its sysfs directories are missing,
#       or one of the sysfs writes failed

# SCSI systems will always have a "root" device link.
# NVMe systems will always have an "os" device link.
# In cases where both a "root" and "os" device link exist, they should always point to the same device.
# details: https://learn.microsoft.com/en-us/azure/virtual-machines/linux/azure-virtual-machine-utilities
if [ -L "/dev/disk/azure/root" ]; then
    LINK_PATH="/dev/disk/azure/root"
elif [ -L "/dev/disk/azure/os" ]; then
    LINK_PATH="/dev/disk/azure/os"
else
    echo "no root or os device link found within /dev/disk/azure, cannot apply disk tuning"
    exit 1
fi

echo "found device link: $LINK_PATH"
DEV_NAME=$(basename "$(readlink -f "$LINK_PATH")")
# Guard against a link that cannot be resolved; without it the later checks would
# report a confusing error about an empty device name.
if [ -z "$DEV_NAME" ]; then
    echo "unable to resolve a device name from link: $LINK_PATH, cannot apply disk tuning"
    exit 1
fi
echo "resolved root device: $DEV_NAME"

# shellcheck disable=SC3010
if [[ "${DEV_NAME,,}" == *"nvme"* ]]; then
    # Disk tuning doesn't currently work as expected on NVMe devices - namely that the /device/queue_depth parameter
    # doesn't seem to be a settable option, and that the default /queue/nr_requests can actually be higher than what we
    # currently set on SCSI (128), which could end up hurting IO performance rather than optimize it.
    # TODO: reach out to NVMe team to see how we can better tune queue settings on NVMe devices.
    echo "$DEV_NAME is an NVMe device, will not apply disk tuning"
    exit 0
fi

if [ ! -d "/sys/block/$DEV_NAME/queue" ]; then
    echo "queue settings directory for device: $DEV_NAME does not exist, unable to apply desired settings"
    exit 1
fi

if [ ! -d "/sys/block/$DEV_NAME/device" ]; then
    echo "device settings directory for device: $DEV_NAME does not exist, unable to apply desired settings"
    exit 1
fi

NR_REQUESTS_PATH="/sys/block/$DEV_NAME/queue/nr_requests"
QUEUE_DEPTH_PATH="/sys/block/$DEV_NAME/device/queue_depth"

echo "will apply settings to $NR_REQUESTS_PATH and $QUEUE_DEPTH_PATH"

# Each write is checked individually: without this, a failed nr_requests write would be
# masked by a successful queue_depth write and the script would still exit 0.
if ! echo 128 > "$NR_REQUESTS_PATH"; then
    echo "failed to write to $NR_REQUESTS_PATH"
    exit 1
fi

if ! echo 128 > "$QUEUE_DEPTH_PATH"; then
    echo "failed to write to $QUEUE_DEPTH_PATH"
    exit 1
fi

vhdbuilder/packer/imagecustomizer/azlosguard/azlosguard.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,9 @@ os:
186186
destination: /etc/profile.d/CIS.sh
187187
permissions: 755
188188
# disk queue perf tuning
189+
- source: /AgentBaker/parts/linux/cloud-init/artifacts/disk_queue.sh
190+
destination: /opt/azure/containers/disk_queue.sh
191+
permissions: 755
189192
- source: /AgentBaker/parts/linux/cloud-init/artifacts/disk_queue.service
190193
destination: /etc/systemd/system/disk_queue.service
191194
permissions: 644

vhdbuilder/packer/packer_source.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ copyPackerFiles() {
5757
NVIDIA_MODPROBE_SERVICE_DEST=/etc/systemd/system/nvidia-modprobe.service
5858
NVIDIA_DOCKER_DAEMON_SRC=/home/packer/nvidia-docker-daemon.json
5959
NVIDIA_DOCKER_DAEMON_DEST=/etc/systemd/system/nvidia-docker-daemon.json
60+
DISK_QUEUE_SCRIPT_SRC=/home/packer/disk_queue.sh
61+
DISK_QUEUE_SCRIPT_DEST=/opt/azure/containers/disk_queue.sh
6062
DISK_QUEUE_SERVICE_SRC=/home/packer/disk_queue.service
6163
DISK_QUEUE_SERVICE_DEST=/etc/systemd/system/disk_queue.service
6264
CGROUP_MEMORY_TELEMETRY_SERVICE_SRC=/home/packer/cgroup-memory-telemetry.service
@@ -342,6 +344,7 @@ copyPackerFiles() {
342344
cpAndMode $KMS_SERVICE_SRC $KMS_SERVICE_DEST 644
343345
cpAndMode $MIG_PARTITION_SRC $MIG_PARTITION_DEST 544
344346
cpAndMode $CONTAINERD_EXEC_START_SRC $CONTAINERD_EXEC_START_DEST 644
347+
cpAndMode $DISK_QUEUE_SCRIPT_SRC $DISK_QUEUE_SCRIPT_DEST 755
345348
cpAndMode $DISK_QUEUE_SERVICE_SRC $DISK_QUEUE_SERVICE_DEST 644
346349
cpAndMode $CGROUP_MEMORY_TELEMETRY_SERVICE_SRC $CGROUP_MEMORY_TELEMETRY_SERVICE_DEST 644
347350
cpAndMode $CGROUP_MEMORY_TELEMETRY_SCRIPT_SRC $CGROUP_MEMORY_TELEMETRY_SCRIPT_DEST 755

vhdbuilder/packer/test/linux-vhd-content-test.sh

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1705,6 +1705,8 @@ checkLocaldnsScriptsAndConfigs() {
17051705
return 0
17061706
}
17071707

1708+
#------------------------ End of test code related to localdns ------------------------
1709+
17081710
# Check that no files have a numeric UID or GID, which would indicate a file ownership issue.
17091711
testFileOwnership() {
17101712
local test="testFileOwnership"
@@ -1724,7 +1726,18 @@ testFileOwnership() {
17241726
return 0
17251727
}
17261728

1727-
#------------------------ End of test code related to localdns ------------------------
1729+
# Reports whether disk_queue.service is active according to systemctl.
# On failure the unit's SubState is included in the message passed to err.
testDiskQueueServiceIsActive() {
  # NOTE(review): the variable name "test" is kept as-is; bash locals are dynamically
  # scoped, so helpers such as err could be reading it - confirm before renaming.
  local test="testDiskQueueServiceIsActive"
  echo "$test: Start"

  if ! systemctl is-active --quiet disk_queue.service; then
    err $test "disk_queue.service is not active, status: $(systemctl show -p SubState --value disk_queue.service)"
  else
    echo $test "disk_queue.service is active, as expected"
  fi

  echo "$test:Finish"
}
17281741

17291742
# As we call these tests, we need to bear in mind how the test results are processed by the
17301743
# caller in run-tests.sh. That code uses az vm run-command invoke to run this script
@@ -1777,3 +1790,4 @@ testCorednsBinaryExtractedAndCached $OS_VERSION
17771790
checkLocaldnsScriptsAndConfigs
17781791
testPackageDownloadURLFallbackLogic
17791792
testFileOwnership $OS_SKU
1793+
testDiskQueueServiceIsActive

vhdbuilder/packer/vhd-image-builder-arm64-gen2.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -418,6 +418,11 @@
418418
"source": "parts/linux/cloud-init/artifacts/profile-d-cis.sh",
419419
"destination": "/home/packer/profile-d-cis.sh"
420420
},
421+
{
422+
"type": "file",
423+
"source": "parts/linux/cloud-init/artifacts/disk_queue.sh",
424+
"destination": "/home/packer/disk_queue.sh"
425+
},
421426
{
422427
"type": "file",
423428
"source": "parts/linux/cloud-init/artifacts/disk_queue.service",

vhdbuilder/packer/vhd-image-builder-base.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -426,6 +426,11 @@
426426
"source": "parts/linux/cloud-init/artifacts/profile-d-cis.sh",
427427
"destination": "/home/packer/profile-d-cis.sh"
428428
},
429+
{
430+
"type": "file",
431+
"source": "parts/linux/cloud-init/artifacts/disk_queue.sh",
432+
"destination": "/home/packer/disk_queue.sh"
433+
},
429434
{
430435
"type": "file",
431436
"source": "parts/linux/cloud-init/artifacts/disk_queue.service",

vhdbuilder/packer/vhd-image-builder-cvm.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,11 @@
430430
"source": "parts/linux/cloud-init/artifacts/profile-d-cis.sh",
431431
"destination": "/home/packer/profile-d-cis.sh"
432432
},
433+
{
434+
"type": "file",
435+
"source": "parts/linux/cloud-init/artifacts/disk_queue.sh",
436+
"destination": "/home/packer/disk_queue.sh"
437+
},
433438
{
434439
"type": "file",
435440
"source": "parts/linux/cloud-init/artifacts/disk_queue.service",

0 commit comments

Comments
 (0)