Skip to content

Commit 2bb9522

Browse files
authored
Update gpu_offset_control_v2
Now with ability to monitor GPU voltages plus some cleanup and improved help section
1 parent ec9a113 commit 2bb9522

File tree

1 file changed

+149
-22
lines changed

1 file changed

+149
-22
lines changed

gpu_offset_control_v2

Lines changed: 149 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,14 @@ import time
99
import argparse
1010
from pynvml import *
1111
import ctypes
12+
import subprocess
13+
import re
1214

1315
# ===== USER CONFIGURABLE PARAMETERS =====
1416
CONFIG = {
1517
# Clock frequency limits (MHz)
1618
'min_clock': 210,
17-
'max_clock': 1800,
19+
'max_clock': 1740,
1820

1921
# Temperature thresholds (°C)
2022
'temperature_min': 20,
@@ -34,12 +36,12 @@ CONFIG = {
3436

3537
# Low frequency range settings
3638
'low_freq_min': 1000,
37-
'low_freq_max': 1450,
39+
'low_freq_max': 1440,
3840
'drain_offset_lmin': -30,
3941
'drain_offset_lmax': 0,
4042

4143
# High frequency range settings
42-
'high_freq_min': 1450,
44+
'high_freq_min': 1440,
4345
'high_freq_max': 1800,
4446
'drain_offset_hmin': 0,
4547
'drain_offset_hmax': 15,
@@ -53,14 +55,15 @@ CONFIG = {
5355
'power_offset_min': 0,
5456

5557
# Memory clock offset (MHz) - applied via NVML
56-
'memory_offset': 0, # Set non-zero to overclock memory
58+
'memory_offset': 0, # Set non-zero to overclock/underclock memory
5759

5860
# Offset change threshold to prevent micro-adjustments
5961
'offset_change_threshold': 15, # Only apply if offset changes by at least this much (MHz)
6062

6163
# Performance optimizations
6264
'skip_idle_and_low_power_pstates': True, # Skip offset calculations when GPU is idle or low power
6365
'idle_and_low_power_pstates_threshold': 1, # P-states above this are considered idle/low-power
66+
'idle_status_display_interval': 10, # Show idle status every N cycles
6467

6568
# Control flags
6669
'drain_offset_control': True,
@@ -71,6 +74,9 @@ CONFIG = {
7174
# Reset clock limits on exit
7275
'reset_clock_limits_on_exit': False, # Keep clock limits after script stops
7376

77+
# Voltage monitoring legacy path (for nvidia-smi 565 or earlier)
78+
'nvidia_smi_legacy_path': '', # Example: '/path/to/nvidia-smi-565'
79+
7480
# Refresh interval (seconds)
7581
'refresh_interval': 1,
7682

@@ -90,8 +96,8 @@ DESCRIPTION:
9096
and frequency using NVIDIA Management Library (NVML).
9197
9298
REQUIREMENTS:
99+
- NVIDIA GPU
93100
- nvidia-ml-py library (pip install nvidia-ml-py)
94-
- NVIDIA GPU with offset support
95101
- sudo privileges for applying clock limits
96102
97103
USAGE:
@@ -136,6 +142,7 @@ CONFIGURABLE PARAMETERS:
136142
137143
→ Applies when GPU frequency is between low_freq_min and low_freq_max
138144
→ Linearly increases with temperature
145+
→ GPU voltage should be below 700mV
139146
140147
High Frequency Range (drain offset):
141148
high_freq_min High frequency range start (MHz)
@@ -145,6 +152,7 @@ CONFIGURABLE PARAMETERS:
145152
146153
→ Applies when GPU frequency is between high_freq_min and high_freq_max
147154
→ Linearly decreases with temperature
155+
→ GPU voltage should be above 700mV
148156
149157
Power-Based Offset:
150158
power_offset_max Maximum power offset at/below plimit_min (MHz)
@@ -162,14 +170,21 @@ CONFIGURABLE PARAMETERS:
162170
Prevents micro-adjustments that cause stuttering
163171
skip_idle_and_low_power_pstates Skip calculations when GPU is idle/low-power
164172
idle_and_low_power_pstates_threshold P-states above this are idle/low-power
173+
idle_status_display_interval Show idle status every N cycles
165174
166175
Control Flags:
167176
drain_offset_control Enable/disable drain offset (True/False)
168177
power_offset_control Enable/disable power offset (True/False)
169-
critical_temp_range_control Enable/disable critical temp logic (True/False)
178+
critical_temp_range_control Enable/disable critical temp logic
179+
(disable drain offset in temps where voltage fluctuates) (True/False)
170180
show_info Show detailed statistics (True/False)
171181
reset_clock_limits_on_exit Reset clock limits when stopping (True/False)
172182
183+
Voltage Monitoring (nvidia-smi 565 or earlier):
184+
nvidia_smi_legacy_path Path to nvidia-smi binary v565 or earlier
185+
Example: '/opt/nvidia-565/bin/nvidia-smi'
186+
Leave empty to use system nvidia-smi
187+
173188
Refresh Settings:
174189
refresh_interval Update interval in seconds
175190
@@ -183,7 +198,7 @@ OFFSET CALCULATION:
183198
- Applies different offsets for low and high frequency ranges
184199
- Low range: Changes with temperature increase
185200
- High range: Changes with temperature increase
186-
- Critical temp range overrides if enabled
201+
- Critical temp range overrides if enabled to prevent crashes
187202
188203
3. Power Offset (power_offset):
189204
- Maximum offset at low power consumption
@@ -216,9 +231,11 @@ EXAMPLES:
216231
217232
NOTES:
218233
- Only P-state 0 receives calculated offsets
219-
- Non-P0 states use freq_offset_min to prevent crashes during level loading
234+
- Non-P0 states use freq_offset_min to prevent crashes during periods when GPU
235+
is underpowered (in menus, FMV, game emulators, etc.)
220236
- Script requires sudo for setting clock limits
221237
- Press Ctrl+C to stop the script gracefully
238+
- Voltage monitoring requires nvidia-smi version 565 or earlier
222239
223240
╚══════════════════════════════════════════════════════════════════════════════╝
224241
"""
@@ -268,6 +285,88 @@ def smart_round_offset(offset, threshold):
268285
else:
269286
return base_units * threshold
270287

288+
def get_pynvml_version():
    """Return the NVML library version string.

    Queries ``nvmlSystemGetNVMLVersion()``. This is best-effort: any
    ordinary failure yields the placeholder string instead of an error.

    Returns:
        str: The version reported by NVML, or ``"unknown"`` when the query
        fails.
    """
    try:
        version = nvmlSystemGetNVMLVersion()
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl+C
        # (KeyboardInterrupt) and SystemExit still propagate.
        return "unknown"
    # NOTE(review): some pynvml releases appear to return bytes here;
    # decode so the value prints cleanly in f-strings.
    if isinstance(version, bytes):
        version = version.decode("utf-8", errors="replace")
    return version
295+
296+
def get_nvidia_smi_version():
    """Return the major driver version reported by the system ``nvidia-smi``.

    Runs ``nvidia-smi --query-gpu=driver_version --format=csv,noheader`` with
    a 2 second timeout and parses the leading integer of its output
    (e.g. ``"580.15.03"`` -> ``580``).

    Returns:
        int: The major version number, or ``0`` when nvidia-smi cannot be
        run, times out, exits with a non-zero status, or prints nothing that
        starts with digits.
    """
    try:
        result = subprocess.run(
            ['nvidia-smi', '--query-gpu=driver_version', '--format=csv,noheader'],
            capture_output=True,
            text=True,
            timeout=2,
        )
    except (OSError, subprocess.SubprocessError, ValueError):
        # OSError: binary missing / not executable.
        # SubprocessError: includes TimeoutExpired.
        # ValueError: undecodable output (UnicodeDecodeError) or bad arguments.
        # Deliberately not a bare ``except`` so Ctrl+C still stops the script.
        return 0

    if result.returncode != 0:
        return 0

    # Only the leading run of digits (the major version) is of interest.
    match = re.match(r'(\d+)', result.stdout.strip())
    return int(match.group(1)) if match else 0
314+
315+
def get_driver_version():
    """Return the major NVIDIA driver version as reported by NVML.

    Queries ``nvmlSystemGetDriverVersion()`` and extracts the leading integer
    (e.g. ``"565.57.01"`` -> ``565``).

    Returns:
        int: The major version number, or ``0`` when the query fails or the
        reported string does not start with digits.
    """
    try:
        version = nvmlSystemGetDriverVersion()
        # NOTE(review): some pynvml releases appear to return bytes; matching
        # a str pattern against bytes raises TypeError, which previously was
        # swallowed and reported as version 0. Decode first.
        if isinstance(version, bytes):
            version = version.decode("utf-8", errors="replace")
        match = re.match(r'(\d+)', version)
    except Exception:
        # Best-effort lookup, but let KeyboardInterrupt/SystemExit through.
        return 0
    return int(match.group(1)) if match else 0
326+
327+
def get_voltage_nvidia_smi(gpu_id, nvidia_smi_path='nvidia-smi'):
    """Read the GPU graphics voltage via ``nvidia-smi -q -d VOLTAGE``.

    Args:
        gpu_id: GPU index passed to nvidia-smi's ``-i`` option.
        nvidia_smi_path: nvidia-smi executable to invoke.

    Returns:
        float | None: The ``Graphics : <n> mV`` reading converted to volts,
        or ``None`` when the command fails, exits non-zero, or its output
        contains no such line.
    """
    try:
        command = [nvidia_smi_path, '-q', '-d', 'VOLTAGE', '-i', str(gpu_id)]
        completed = subprocess.run(
            command, capture_output=True, text=True, timeout=2
        )
        if completed.returncode != 0:
            return None

        found = re.search(r'Graphics\s+:\s+([0-9.]+)\s*mV', completed.stdout)
        if not found:
            return None

        millivolts = float(found.group(1))
        return millivolts / 1000.0  # mV -> V
    except Exception:
        # Best-effort probe: any failure simply means "no reading".
        return None
344+
345+
def get_gpu_voltage(gpu_id, config, nvidia_smi_version):
    """Obtain the GPU voltage from whichever nvidia-smi binary can supply it.

    Sources are tried in order:
        1. The system nvidia-smi, only when ``nvidia_smi_version`` is in the
           range 1..565.
        2. The binary named by ``config['nvidia_smi_legacy_path']``, when
           that entry is set to a non-empty value.

    Args:
        gpu_id: GPU index forwarded to ``get_voltage_nvidia_smi``.
        config: Mapping that may carry ``'nvidia_smi_legacy_path'``.
        nvidia_smi_version: Major version of the system nvidia-smi
            (0 means it could not be determined).

    Returns:
        tuple: ``(voltage_value, voltage_method)``, or ``(None, None)`` when
        neither source produced a reading.
    """
    # Source 1: system nvidia-smi, version 565 or earlier.
    if 0 < nvidia_smi_version <= 565:
        reading = get_voltage_nvidia_smi(gpu_id)
        if reading is not None:
            return reading, f"nvidia-smi {nvidia_smi_version}"

    # Source 2: user-configured legacy binary.
    legacy_binary = config.get('nvidia_smi_legacy_path', '')
    if legacy_binary:
        reading = get_voltage_nvidia_smi(gpu_id, legacy_binary)
        if reading is not None:
            return reading, "legacy nvidia-smi"

    return None, None
369+
271370
def calculate_freq_offset(freq, config):
272371
"""Calculate base frequency offset."""
273372
return linear_interpolate(
@@ -339,7 +438,7 @@ def calculate_power_offset(power, config):
339438
config['power_offset_min']
340439
)
341440

342-
def get_gpu_stats(handle):
441+
def get_gpu_stats(handle, gpu_id, config, nvidia_smi_version):
343442
"""Retrieve current GPU statistics."""
344443
try:
345444
temp = nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU)
@@ -352,23 +451,19 @@ def get_gpu_stats(handle):
352451
except:
353452
pstate = 0 # Assume P0 if unable to get P-state
354453

355-
# Get voltage if available
356-
voltage_available = False
357-
try:
358-
voltage = nvmlDeviceGetClocksThrottleReasons(handle)
359-
voltage_available = True
360-
except:
361-
voltage_available = False
454+
# Try to get voltage
455+
voltage_value, voltage_method = get_gpu_voltage(gpu_id, config, nvidia_smi_version)
362456

363457
return {
364458
'temperature': temp,
365459
'power': power,
366460
'frequency': clock,
367461
'pstate': pstate,
368-
'voltage_available': voltage_available
462+
'voltage_value': voltage_value,
463+
'voltage_method': voltage_method
369464
}
370465
except NVMLError as e:
371-
print(f"Error getting GPU stats: {e}")
466+
print(f"Error getting GPU stats: {e}", file=sys.stderr)
372467
return None
373468

374469
def apply_clock_limits(handle, config):
@@ -433,8 +528,9 @@ def display_stats(stats, offsets, total_offset_raw, total_offset, config, status
433528
print(f" Temperature: {stats['temperature']:>6}°C")
434529
print(f" Power: {stats['power']:>6.1f} W")
435530

436-
if stats['voltage_available']:
437-
print(f" Voltage Info: Available")
531+
# Show voltage if available
532+
if stats['voltage_value'] is not None:
533+
print(f" Voltage: {stats['voltage_value']:>6.3f} V")
438534

439535
if stats['pstate'] == 0:
440536
print(f"\nOffset Breakdown:")
@@ -483,7 +579,38 @@ def main():
483579
# Get GPU handle
484580
handle = nvmlDeviceGetHandleByIndex(args.device)
485581
gpu_name = nvmlDeviceGetName(handle)
582+
driver_version = get_driver_version()
583+
pynvml_version = get_pynvml_version()
584+
nvidia_smi_version = get_nvidia_smi_version()
585+
486586
print(f"\n🎮 GPU Device: {gpu_name} (ID: {args.device})")
587+
print(f"📦 Driver Version: {driver_version}")
588+
print(f"📦 NVML Version: {pynvml_version}")
589+
print(f"📦 nvidia-smi Driver: {nvidia_smi_version}")
590+
591+
# Check voltage monitoring capability
592+
if nvidia_smi_version > 0 and nvidia_smi_version <= 565:
593+
print("✓ Voltage monitoring: Available (system nvidia-smi)")
594+
elif CONFIG.get('nvidia_smi_legacy_path', ''):
595+
print(f"⚠️ Voltage monitoring: Using legacy nvidia-smi")
596+
print(f" → Path: {CONFIG['nvidia_smi_legacy_path']}")
597+
# Test legacy binary
598+
try:
599+
test_result = subprocess.run(
600+
[CONFIG['nvidia_smi_legacy_path'], '--version'],
601+
capture_output=True,
602+
text=True,
603+
timeout=2
604+
)
605+
if test_result.returncode == 0:
606+
print(f" → Test: OK")
607+
else:
608+
print(f" → Test: FAILED (return code {test_result.returncode})")
609+
except Exception as e:
610+
print(f" → Test: ERROR - {e}")
611+
else:
612+
print("⚠️ Voltage monitoring: Not available (nvidia-smi > 565)")
613+
print(" → Configure 'nvidia_smi_legacy_path' if needed")
487614

488615
# Apply clock limits once
489616
print("\n📊 Applying initial settings...")
@@ -511,7 +638,7 @@ def main():
511638
loop_start = time.time()
512639

513640
# Get current GPU stats
514-
stats = get_gpu_stats(handle)
641+
stats = get_gpu_stats(handle, args.device, CONFIG, nvidia_smi_version)
515642
if not stats:
516643
time.sleep(CONFIG['refresh_interval'])
517644
continue
@@ -523,7 +650,7 @@ def main():
523650
if is_idle_or_low_power:
524651
idle_count += 1
525652
# Display status periodically when idle
526-
if CONFIG['show_info'] and idle_count % 5 == 0:
653+
if CONFIG['show_info'] and idle_count % CONFIG['idle_status_display_interval'] == 0:
527654
print(f"\n{'='*80}")
528655
print(f"GPU Status: [IDLE/LOW-POWER - P{stats['pstate']}]")
529656
print(f" Frequency: {stats['frequency']:>6} MHz")

0 commit comments

Comments
 (0)