Commit 46ca005

polish(pu): add device logs

1 parent fbab0b9 commit 46ca005

File tree

3 files changed: +72 -13 lines changed

ding/policy/base_policy.py

Lines changed: 13 additions & 0 deletions

@@ -169,6 +169,19 @@ def __init__(
         if self._use_accelerator:
             move_to_device(model, self._device_type, self._rank)
 
+        # Print final device configuration summary
+        print(f"\n{'='*70}")
+        print(f"🎉 [DI-engine Policy] Device Setup Complete")
+        print(f"{'='*70}")
+        print(f" Policy Type: {self.__class__.__name__}")
+        print(f" Device Type: {self._device_type.upper()}")
+        print(f" Device String: {self._device}")
+        print(f" Using Accelerator: {self._use_accelerator}")
+        print(f" Rank: {self._rank}")
+        print(f" Multi-GPU: {self._cfg.multi_gpu if hasattr(self._cfg, 'multi_gpu') else False}")
+        print(f" Legacy _cuda flag: {self._cuda}")
+        print(f"{'='*70}\n")
+
         # Multi-GPU initialization
         if len(set(self._enable_field).intersection(set(['learn']))) > 0:
             multi_gpu = self._cfg.multi_gpu
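
For context, on a hypothetical single-GPU DQN run this summary banner would render roughly as below; the policy name, device string, and flag values are assumed for illustration, not taken from a real run:

    ======================================================================
    🎉 [DI-engine Policy] Device Setup Complete
    ======================================================================
     Policy Type: DQNPolicy
     Device Type: CUDA
     Device String: cuda:0
     Using Accelerator: True
     Rank: 0
     Multi-GPU: False
     Legacy _cuda flag: True
    ======================================================================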

ding/torch_utils/device_helper.py

Lines changed: 54 additions & 13 deletions

@@ -31,20 +31,42 @@ def get_available_device() -> Tuple[str, bool]:
         >>> device_type, is_accelerator = get_available_device()
         >>> print(f"Using device: {device_type}")
     """
+    print("\n" + "="*70)
+    print("🔍 [DI-engine] Device Detection")
+    print("="*70)
+
     # Check for NPU first (Huawei Ascend)
-    if TORCH_NPU_AVAILABLE and torch.npu.is_available():
-        npu_count = torch.npu.device_count()
-        logger.info(f"Detected {npu_count} NPU device(s), using NPU")
-        return 'npu', True
+    if TORCH_NPU_AVAILABLE:
+        print("✓ torch_npu module is installed")
+        if torch.npu.is_available():
+            npu_count = torch.npu.device_count()
+            print(f"✓ NPU is available: {npu_count} device(s) detected")
+            print(f"✓ NPU device names: {[torch.npu.get_device_name(i) for i in range(npu_count)]}")
+            print(f"🎯 Selected device: NPU")
+            print("="*70 + "\n")
+            logger.info(f"[Device] Using NPU with {npu_count} device(s)")
+            return 'npu', True
+        else:
+            print("✗ NPU is not available")
+    else:
+        print("✗ torch_npu module is not installed")
 
     # Check for CUDA GPU
     if torch.cuda.is_available():
         gpu_count = torch.cuda.device_count()
-        logger.info(f"Detected {gpu_count} CUDA GPU device(s), using GPU")
+        print(f"✓ CUDA is available: {gpu_count} device(s) detected")
+        print(f"✓ GPU device names: {[torch.cuda.get_device_name(i) for i in range(gpu_count)]}")
+        print(f"🎯 Selected device: CUDA GPU")
+        print("="*70 + "\n")
+        logger.info(f"[Device] Using CUDA GPU with {gpu_count} device(s)")
         return 'cuda', True
+    else:
+        print("✗ CUDA is not available")
 
     # Fallback to CPU
-    logger.info("No NPU or GPU detected, using CPU")
+    print("🎯 Selected device: CPU (no accelerator detected)")
+    print("="*70 + "\n")
+    logger.info("[Device] Using CPU (no accelerator available)")
     return 'cpu', False
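
A minimal usage sketch for this detector, mirroring the doctest in the hunk above (the module path is the file's own path; the tuple follows the declared Tuple[str, bool] return type):

    # Sketch: probe the accelerator once at startup.
    from ding.torch_utils.device_helper import get_available_device

    device_type, is_accelerator = get_available_device()  # e.g. ('cuda', True)
    print(f"Using device: {device_type}")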

@@ -80,13 +102,18 @@ def move_to_device(model: torch.nn.Module, device_type: str, rank: int = 0) -> torch.nn.Module:
     if device_type == 'npu' and TORCH_NPU_AVAILABLE:
         device_count = torch.npu.device_count()
         device_id = rank % device_count if device_count > 0 else 0
+        print(f"📦 [DI-engine] Moving model to NPU device {device_id} (rank={rank})")
         model.npu(device_id)
-        logger.debug(f"Moved model to NPU device {device_id}")
+        logger.info(f"[Device] Model moved to NPU device {device_id}")
     elif device_type == 'cuda':
         device_count = torch.cuda.device_count()
         device_id = rank % device_count if device_count > 0 else 0
+        print(f"📦 [DI-engine] Moving model to CUDA device {device_id} (rank={rank})")
         model.cuda(device_id)
-        logger.debug(f"Moved model to CUDA device {device_id}")
+        logger.info(f"[Device] Model moved to CUDA device {device_id}")
+    else:
+        print(f"📦 [DI-engine] Model will stay on CPU")
+        logger.info("[Device] Model stays on CPU")
     # CPU case: no need to move
     return model
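
The new else branch only makes the CPU case explicit; the placement rule itself is unchanged. A standalone sketch of that rank-to-device mapping, assuming a hypothetical 2-GPU host with 4 workers:

    # Sketch: rank-based placement as in move_to_device above.
    device_count = 2                    # assumed GPUs on the host
    for rank in range(4):               # assumed worker ranks
        device_id = rank % device_count
        print(rank, "->", device_id)    # 0->0, 1->1, 2->0, 3->1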

@@ -128,38 +155,52 @@ def auto_device_init(cfg_device: Optional[str], rank: int = 0) -> Tuple[str, bool, str]:
         >>> # Returns ('cuda', True, 'cuda:0') if GPU available
         >>> # Returns ('cpu', False, 'cpu') otherwise
     """
+    print(f"\n⚙️ [DI-engine] Device Configuration: cfg_device='{cfg_device}', rank={rank}")
+
     # Default to auto detection if not specified
     if cfg_device is None or cfg_device == 'auto':
+        print(f"🔧 [DI-engine] Using auto-detection mode")
         device_type, use_accelerator = get_available_device()
     else:
         # Explicit device type specified
         device_type = cfg_device.lower()
+        print(f"🔧 [DI-engine] Explicit device type requested: '{device_type}'")
 
         # Validate the device type is available
         if device_type == 'npu':
             if TORCH_NPU_AVAILABLE and torch.npu.is_available():
                 use_accelerator = True
-                logger.info("Using NPU as explicitly configured")
+                npu_count = torch.npu.device_count()
+                print(f"✓ NPU requested and available: {npu_count} device(s)")
+                logger.info(f"[Device] Using NPU as explicitly configured ({npu_count} device(s))")
             else:
-                logger.warning("NPU requested but not available, falling back to CPU")
+                print(f"⚠️ NPU requested but not available, falling back to CPU")
+                logger.warning("[Device] NPU requested but not available, falling back to CPU")
                 device_type = 'cpu'
                 use_accelerator = False
         elif device_type == 'cuda':
             if torch.cuda.is_available():
                 use_accelerator = True
-                logger.info("Using CUDA GPU as explicitly configured")
+                gpu_count = torch.cuda.device_count()
+                print(f"✓ CUDA requested and available: {gpu_count} device(s)")
+                logger.info(f"[Device] Using CUDA GPU as explicitly configured ({gpu_count} device(s))")
             else:
-                logger.warning("CUDA requested but not available, falling back to CPU")
+                print(f"⚠️ CUDA requested but not available, falling back to CPU")
+                logger.warning("[Device] CUDA requested but not available, falling back to CPU")
                 device_type = 'cpu'
                 use_accelerator = False
         else:
             # CPU or any other value
             device_type = 'cpu'
             use_accelerator = False
-            logger.info("Using CPU as configured")
+            print(f"✓ Using CPU as configured")
+            logger.info("[Device] Using CPU as configured")
 
     device_str = get_device_string(device_type, rank)
 
+    print(f"✅ [DI-engine] Device initialized: type={device_type}, accelerator={use_accelerator}, device_string='{device_str}'")
+    print("="*70 + "\n")
+
     return device_type, use_accelerator, device_str
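
A short sketch of the three-tuple contract, using the values the docstring itself promises:

    # Sketch: resolve the device from config at initialization time.
    from ding.torch_utils.device_helper import auto_device_init

    device_type, use_accelerator, device_str = auto_device_init('auto', rank=0)
    # GPU host available -> ('cuda', True, 'cuda:0')
    # CPU-only host      -> ('cpu', False, 'cpu')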

ding/utils/default_helper.py

Lines changed: 5 additions & 0 deletions

@@ -438,18 +438,23 @@ def set_pkg_seed(seed: int, use_cuda: bool = True) -> None:
         >>> ...
 
     """
+    print(f"\n🌱 [DI-engine] Setting random seed: {seed}")
     random.seed(seed)
     np.random.seed(seed)
     torch.manual_seed(seed)
+    print(f" ✓ Set seed for: random, numpy, torch")
 
     # Set seed for accelerators (GPU or NPU)
     if use_cuda:
         # Set CUDA seed if available
         if torch.cuda.is_available():
             torch.cuda.manual_seed(seed)
+            print(f" ✓ Set CUDA seed: {seed}")
         # Set NPU seed if available
         if TORCH_NPU_AVAILABLE and torch.npu.is_available():
             torch.npu.manual_seed(seed)
+            print(f" ✓ Set NPU seed: {seed}")
+    print()
 
 
 @lru_cache()
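
Called as set_pkg_seed(0) on a CUDA-only host, the added logging would emit roughly the following (illustrative, derived from the print statements above):

    🌱 [DI-engine] Setting random seed: 0
     ✓ Set seed for: random, numpy, torch
     ✓ Set CUDA seed: 0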
