# proxmox-vm-autoscale/vm_manager.py (main branch) · fabriziosalmi/proxmox-vm-autoscale · GitHub

import logging
import re
import time
import threading


class VMResourceManager:
    def __init__(self, ssh_client, vm_id, config):
        self.ssh_client = ssh_client
        self.vm_id = vm_id
        self.config = config
        self.logger = logging.getLogger("vm_resource_manager")
        self.last_scale_time = 0
        self.scale_cooldown = self.config.get("scale_cooldown", 300)  # Default to 5 minutes
        self.scale_lock = threading.Lock()  # Added lock for scaling control
        self.auto_configure_hotplug = self.config.get("auto_configure_hotplug", True)

        # Auto-configure hotplug if enabled
        if self.auto_configure_hotplug:
            self._ensure_hotplug_configured()

    def _ensure_hotplug_configured(self):
        """Ensure hotplug and NUMA are enabled for this VM (for live scaling support)."""
        try:
            cpu_hotplug, memory_hotplug = self._check_hotplug_enabled()
            numa_enabled = self._check_numa_enabled()

            needs_update = False
            updates = []

            # Check if we need to enable hotplug for CPU/memory
            if not cpu_hotplug or not memory_hotplug:
                updates.append("-hotplug cpu,memory,network,disk,usb")
                needs_update = True
                self.logger.info(f"VM {self.vm_id}: Enabling hotplug for cpu,memory,network,disk,usb")

            # Check if we need to enable NUMA (required for memory hotplug)
            if not numa_enabled:
                updates.append("-numa 1")
                needs_update = True
                self.logger.info(f"VM {self.vm_id}: Enabling NUMA for memory hotplug support")

            if needs_update:
                command = f"qm set {self.vm_id} {' '.join(updates)}"
                output = self.ssh_client.execute_command(command)
                self._get_command_output(output)
                self.logger.info(
                    f"VM {self.vm_id}: Hotplug configuration updated. "
                    "Note: NUMA changes require a VM restart to take effect."
                )
        except Exception as e:
            self.logger.warning(f"Failed to auto-configure hotplug for VM {self.vm_id}: {e}")

    def _get_command_output(self, output):
        """Helper method to properly handle command output that might be a tuple."""
        if isinstance(output, tuple):
            # Assuming the first element contains the stdout
            return str(output[0]).strip() if output and output[0] is not None else ""
        return str(output).strip() if output is not None else ""

    def is_vm_running(self, retries=3, delay=5):
        """Check if the VM is running with retries and improved error handling."""
        for attempt in range(1, retries + 1):
            try:
                command = f"qm status {self.vm_id} --verbose"
                self.logger.debug(f"Executing command to check VM status: {command}")
                output = self.ssh_client.execute_command(command)
                output_str = self._get_command_output(output)
                self.logger.debug(f"Command output: {output_str}")

                if "status: running" in output_str.lower():
                    self.logger.info(f"VM {self.vm_id} is running.")
                    return True
                elif "status:" in output_str.lower():
                    self.logger.info(f"VM {self.vm_id} is not running.")
                    return False
                else:
                    self.logger.warning(
                        f"Unexpected output while checking VM status: {output_str}"
                    )
            except Exception as e:
                self.logger.warning(
                    f"Attempt {attempt}/{retries} failed to check VM status: {e}. Retrying..."
                )
                time.sleep(delay * attempt)  # Exponential backoff

        self.logger.error(
            f"Unable to determine status of VM {self.vm_id} after {retries} attempts."
        )
        return False

    def get_resource_usage(self):
        """Retrieve CPU and RAM usage as percentages."""
        try:
            if not self.is_vm_running():
                return 0.0, 0.0
            #command = f"qm status {self.vm_id} --verbose"
            # Updated command  - this might well be refinable to simpler and faster.
            vmid = self.vm_id
            command = f"pvesh get /cluster/resources | grep 'qemu/{vmid}' | awk -F '│' '{{print $6, $15, $16}}'"
            output = self.ssh_client.execute_command(command)
            # example output: "  3.17%     5.00 GiB     3.82 GiB "
            self.logger.info(f"VM status output: {output}")
            cpu_usage = self._parse_cpu_usage(output)
            ram_usage = self._parse_ram_usage(output)
            return cpu_usage, ram_usage
        except Exception as e:
            self.logger.error(f"Failed to retrieve resource usage: {e}")
            return 0.0, 0.0

    def can_scale(self):
        """Determine if scaling can occur using a lock to avoid race conditions."""
        with self.scale_lock:
            current_time = time.time()
            if current_time - self.last_scale_time < self.scale_cooldown:
                return False
            self.last_scale_time = current_time
            return True

    def scale_cpu(self, direction):
        """Scale the CPU cores and vCPUs of the VM."""
        if not self.can_scale():
            return False

        try:
            current_cores = self._get_current_cores()
            max_cores = self._get_max_cores()
            min_cores = self._get_min_cores()
            current_vcpus = self._get_current_vcpus()

            self.last_scale_time = time.time()
            if direction == "up" and current_cores < max_cores:
                self._scale_cpu_up(current_cores, current_vcpus)
                return True
            elif direction == "down" and current_cores > min_cores:
                self._scale_cpu_down(current_cores, current_vcpus)
                return True
            else:
                self.logger.info("No CPU scaling required.")
                return False
        except Exception as e:
            self.logger.error(f"Failed to scale CPU: {e}")
            raise

    def scale_ram(self, direction):
        """Scale the RAM of the VM."""
        if not self.can_scale():
            return False

        try:
            current_ram = self._get_current_ram()
            max_ram = self._get_max_ram()
            min_ram = self._get_min_ram()

            self.last_scale_time = time.time()
            if direction == "up" and current_ram < max_ram:
                new_ram = min(current_ram + 512, max_ram)
                self._set_ram(new_ram)
                return True
            elif direction == "down" and current_ram > min_ram:
                new_ram = max(current_ram - 512, min_ram)
                self._set_ram(new_ram)
                return True
            else:
                self.logger.info("No RAM scaling required.")
            return False
        except Exception as e:
            self.logger.error(f"Failed to scale RAM: {e}")
            raise

    def _parse_cpu_usage(self, output):
        """Parse CPU usage from VM status output."""
        try:
            output_str = self._get_command_output(output)
            percentage_cpu_match = re.search(r"^\s*(\d+(?:\.\d+)?)%", output_str)
            if percentage_cpu_match:
                return float(percentage_cpu_match.group(1))
            self.logger.warning("CPU usage not found in output.")
            return 0.0
        except Exception as e:
            self.logger.error(f"Error parsing CPU usage: {e}")
            return 0.0

    def _convert_to_gib(self, value, unit):
        """ Converts memory units to GiB. """
        unit = unit.lower()
        if unit == 'gib':
            return value
        elif unit == 'mib':
            return value / 1024  # Convert MiB to GiB
        else:
            self.logger.warning(f"Unknown memory unit '{unit}'. Assuming GiB.")
            return value  # Assume GiB if unit is unknown

    def _parse_ram_usage(self, output):
        """ Parses RAM usage from VM status output. """
        try:
            output_str = self._get_command_output(output)
            self.logger.debug(f"Processing output: '{output_str}'")
            # ----------------------------
            # Extract Memory Values
            # ----------------------------
            # Pattern Explanation:
            # - (\d+(?:\.\d+)?)\s+(GiB|MiB) : Capture first memory value and its unit
            # - \s+                         : Match one or more whitespace characters
            # - (\d+(?:\.\d+)?)\s+(GiB|MiB) : Capture second memory value and its unit
            pattern_memory = r"(\d+(?:\.\d+)?)\s+(GiB|MiB)\s+(\d+(?:\.\d+)?)\s+(GiB|MiB)"
            memory_match = re.search(pattern_memory, output_str)
            if memory_match:
                max_mem_value = float(memory_match.group(1))
                max_mem_unit = memory_match.group(2)
                used_mem_value = float(memory_match.group(3))
                used_mem_unit = memory_match.group(4)

                self.logger.debug(f"Extracted Max Memory: {max_mem_value} {max_mem_unit}")
                self.logger.debug(f"Extracted Used Memory: {used_mem_value} {used_mem_unit}")

                # Convert memory values to GiB
                max_mem_gib = self._convert_to_gib(max_mem_value, max_mem_unit)
                used_mem_gib = self._convert_to_gib(used_mem_value, used_mem_unit)

                self.logger.debug(f"Converted Max Memory: {max_mem_gib} GiB")
                self.logger.debug(f"Converted Used Memory: {used_mem_gib} GiB")

                if max_mem_gib == 0:
                    self.logger.warning("Maximum memory is zero. Cannot compute usage percentage.")
                    return 0.0

                # Calculate RAM usage percentage based on memory values
                usage_percentage = (used_mem_gib / max_mem_gib) * 100
                self.logger.debug(f"Calculated RAM Usage: {usage_percentage:.2f}%")
                return usage_percentage
            else:
                self.logger.warning("RAM memory values not found in output.")
                return 0.0

        except Exception as e:
            self.logger.error(f"Error parsing RAM usage: {e}")
            return 0.0

    def _get_current_vcpus(self):
        """Retrieve current vCPUs assigned to the VM."""
        try:
            command = f"qm config {self.vm_id}"
            output = self.ssh_client.execute_command(command)
            output_str = self._get_command_output(output)
            match = re.search(r"vcpus:\s*(\d+)", output_str)
            return int(match.group(1)) if match else 1
        except Exception as e:
            self.logger.error(f"Failed to retrieve vCPUs: {e}")
            return 1

    def _get_current_cores(self):
        """Retrieve current CPU cores assigned to the VM."""
        try:
            command = f"qm config {self.vm_id}"
            output = self.ssh_client.execute_command(command)
            output_str = self._get_command_output(output)
            match = re.search(r"cores:\s*(\d+)", output_str)
            return int(match.group(1)) if match else 1
        except Exception as e:
            self.logger.error(f"Failed to retrieve CPU cores: {e}")
            return 1

    def _get_max_cores(self):
        """Retrieve maximum allowed CPU cores."""
        return self.config.get("max_cores", 8)

    def _get_min_cores(self):
        """Retrieve minimum allowed CPU cores."""
        return self.config.get("min_cores", 1)

    def _get_current_ram(self):
        """Retrieve current RAM assigned to the VM."""
        try:
            command = f"qm config {self.vm_id}"
            output = self.ssh_client.execute_command(command)
            output_str = self._get_command_output(output)
            match = re.search(r"memory:\s*(\d+)", output_str)
            return int(match.group(1)) if match else 512
        except Exception as e:
            self.logger.error(f"Failed to retrieve current RAM: {e}")
            return 512

    def _get_max_ram(self):
        """Retrieve maximum allowed RAM."""
        return self.config.get("max_ram", 16384)

    def _get_min_ram(self):
        """Retrieve minimum allowed RAM."""
        return self.config.get("min_ram", 512)

    def _check_hotplug_enabled(self):
        """Check if hotplug is enabled for CPU and memory on this VM."""
        try:
            command = f"qm config {self.vm_id}"
            output = self.ssh_client.execute_command(command)
            output_str = self._get_command_output(output)

            # Check for hotplug setting (e.g., "hotplug: cpu,memory" or "hotplug: network,disk,cpu,memory")
            hotplug_match = re.search(r"hotplug:\s*([^\n]+)", output_str)
            if hotplug_match:
                hotplug_settings = hotplug_match.group(1).lower()
                cpu_hotplug = 'cpu' in hotplug_settings
                memory_hotplug = 'memory' in hotplug_settings
                return cpu_hotplug, memory_hotplug

            # If no hotplug line, hotplug is disabled
            return False, False
        except Exception as e:
            self.logger.error(f"Failed to check hotplug settings: {e}")
            return False, False

    def _check_numa_enabled(self):
        """Check if NUMA is enabled on this VM (required for memory hotplug)."""
        try:
            command = f"qm config {self.vm_id}"
            output = self.ssh_client.execute_command(command)
            output_str = self._get_command_output(output)

            # Check for numa setting
            numa_match = re.search(r"numa:\s*(\d+)", output_str)
            if numa_match:
                return int(numa_match.group(1)) == 1
            return False
        except Exception as e:
            self.logger.error(f"Failed to check NUMA settings: {e}")
            return False

    def _get_balloon_value(self):
        """Get current balloon memory value."""
        try:
            command = f"qm config {self.vm_id}"
            output = self.ssh_client.execute_command(command)
            output_str = self._get_command_output(output)
            match = re.search(r"balloon:\s*(\d+)", output_str)
            return int(match.group(1)) if match else None
        except Exception as e:
            self.logger.debug(f"Failed to get balloon value: {e}")
            return None

    def _set_ram(self, ram):
        """Set the RAM for the VM, using balloon for hotplug if available."""
        try:
            is_running = self.is_vm_running()
            _, memory_hotplug = self._check_hotplug_enabled()
            numa_enabled = self._check_numa_enabled()

            if is_running and memory_hotplug and numa_enabled:
                # Use balloon for immediate effect on running VMs with hotplug
                command = f"qm set {self.vm_id} -balloon {ram}"
                output = self.ssh_client.execute_command(command)
                self._get_command_output(output)
                self.logger.info(f"RAM balloon set to {ram} MB for VM {self.vm_id} (hotplug applied).")
            elif is_running and memory_hotplug and not numa_enabled:
                # Hotplug enabled but NUMA not - this won't work properly
                self.logger.warning(
                    f"VM {self.vm_id} has memory hotplug enabled but NUMA is disabled. "
                    "Memory changes will require a reboot. Enable NUMA for live memory scaling."
                )
                command = f"qm set {self.vm_id} -memory {ram}"
                output = self.ssh_client.execute_command(command)
                self._get_command_output(output)
                self.logger.info(f"RAM config set to {ram} MB for VM {self.vm_id} (requires reboot).")
            elif is_running:
                # No hotplug - warn and set config only
                self.logger.warning(
                    f"VM {self.vm_id} does not have memory hotplug enabled. "
                    "Memory changes will require a reboot. Enable 'hotplug: memory' and NUMA for live scaling."
                )
                command = f"qm set {self.vm_id} -memory {ram}"
                output = self.ssh_client.execute_command(command)
                self._get_command_output(output)
                self.logger.info(f"RAM config set to {ram} MB for VM {self.vm_id} (requires reboot).")
            else:
                # VM not running - just set memory config
                command = f"qm set {self.vm_id} -memory {ram}"
                output = self.ssh_client.execute_command(command)
                self._get_command_output(output)
                self.logger.info(f"RAM set to {ram} MB for VM {self.vm_id}.")
        except Exception as e:
            self.logger.error(f"Failed to set RAM to {ram}: {e}")
            raise

    def _scale_cpu_up(self, current_cores, current_vcpus):
        """Helper method to scale CPU up, using hotplug when available."""
        is_running = self.is_vm_running()
        cpu_hotplug, _ = self._check_hotplug_enabled()

        if is_running and cpu_hotplug:
            # For hotplug: prefer adjusting vcpus within current cores limit
            if current_vcpus < current_cores:
                # We can increase vcpus without changing cores
                new_vcpus = current_vcpus + 1
                self._set_vcpus(new_vcpus)
                self.logger.info(f"Scaled up vCPUs to {new_vcpus} for VM {self.vm_id} (hotplug applied).")
            else:
                # vcpus == cores, need to increase cores (config change) then vcpus
                new_cores = current_cores + 1
                self._set_cores(new_cores)
                new_vcpus = current_vcpus + 1
                self._set_vcpus(new_vcpus)
                self.logger.warning(
                    f"VM {self.vm_id}: Increased cores to {new_cores} (requires reboot for full effect) "
                    f"and vCPUs to {new_vcpus} (hotplug applied)."
                )
        elif is_running:
            # No hotplug - config change only, warn user
            new_cores = current_cores + 1
            self._set_cores(new_cores)
            new_vcpus = min(current_vcpus + 1, new_cores)
            self._set_vcpus(new_vcpus)
            self.logger.warning(
                f"VM {self.vm_id} does not have CPU hotplug enabled. "
                "CPU changes will require a reboot. Enable 'hotplug: cpu' for live CPU scaling."
            )
        else:
            # VM not running - just set config
            new_cores = current_cores + 1
            self._set_cores(new_cores)
            new_vcpus = min(current_vcpus + 1, new_cores)
            self._set_vcpus(new_vcpus)

    def _scale_cpu_down(self, current_cores, current_vcpus):
        """Helper method to scale CPU down, using hotplug when available."""
        is_running = self.is_vm_running()
        cpu_hotplug, _ = self._check_hotplug_enabled()

        if is_running and cpu_hotplug:
            # For hotplug: reduce vcpus first (immediate effect)
            new_vcpus = max(current_vcpus - 1, 1)
            self._set_vcpus(new_vcpus)
            self.logger.info(f"Scaled down vCPUs to {new_vcpus} for VM {self.vm_id} (hotplug applied).")

            # Optionally reduce cores if vcpus is significantly lower
            # (cores change requires reboot, so we only do it when it makes sense)
            new_cores = current_cores - 1
            if new_cores >= new_vcpus and new_cores >= self._get_min_cores():
                self._set_cores(new_cores)
                self.logger.info(
                    f"Also reduced cores config to {new_cores} for VM {self.vm_id} "
                    "(will take effect after reboot)."
                )
        elif is_running:
            # No hotplug - config change only, warn user
            new_vcpus = max(current_vcpus - 1, 1)
            self._set_vcpus(new_vcpus)
            new_cores = current_cores - 1
            self._set_cores(new_cores)
            self.logger.warning(
                f"VM {self.vm_id} does not have CPU hotplug enabled. "
                "CPU changes will require a reboot. Enable 'hotplug: cpu' for live CPU scaling."
            )
        else:
            # VM not running - just set config
            new_vcpus = max(current_vcpus - 1, 1)
            self._set_vcpus(new_vcpus)
            new_cores = current_cores - 1
            self._set_cores(new_cores)

    def _set_cores(self, cores):
        """Set the CPU cores for the VM (config change, requires reboot for running VMs)."""
        try:
            command = f"qm set {self.vm_id} -cores {cores}"
            output = self.ssh_client.execute_command(command)
            self._get_command_output(output)
            self.logger.debug(f"CPU cores config set to {cores} for VM {self.vm_id}.")
        except Exception as e:
            self.logger.error(f"Failed to set CPU cores to {cores}: {e}")
            raise

    def _set_vcpus(self, vcpus):
        """Set the vCPUs for the VM (can be hotplugged if enabled)."""
        try:
            command = f"qm set {self.vm_id} -vcpus {vcpus}"
            output = self.ssh_client.execute_command(command)
            self._get_command_output(output)
            self.logger.debug(f"vCPUs set to {vcpus} for VM {self.vm_id}.")
        except Exception as e:
            self.logger.error(f"Failed to set vCPUs to {vcpus}: {e}")
            raise