@@ -31,20 +31,42 @@ def get_available_device() -> Tuple[str, bool]:
         >>> device_type, is_accelerator = get_available_device()
         >>> print(f"Using device: {device_type}")
     """
+    print("\n" + "=" * 70)
+    print("🔍 [DI-engine] Device Detection")
+    print("=" * 70)
+
     # Check for NPU first (Huawei Ascend)
-    if TORCH_NPU_AVAILABLE and torch.npu.is_available():
-        npu_count = torch.npu.device_count()
-        logger.info(f"Detected {npu_count} NPU device(s), using NPU")
-        return 'npu', True
+    if TORCH_NPU_AVAILABLE:
+        print("✓ torch_npu module is installed")
+        if torch.npu.is_available():
+            npu_count = torch.npu.device_count()
+            print(f"✓ NPU is available: {npu_count} device(s) detected")
+            print(f"✓ NPU device names: {[torch.npu.get_device_name(i) for i in range(npu_count)]}")
+            print("🎯 Selected device: NPU")
+            print("=" * 70 + "\n")
+            logger.info(f"[Device] Using NPU with {npu_count} device(s)")
+            return 'npu', True
+        else:
+            print("✗ NPU is not available")
+    else:
+        print("✗ torch_npu module is not installed")
 
     # Check for CUDA GPU
     if torch.cuda.is_available():
         gpu_count = torch.cuda.device_count()
-        logger.info(f"Detected {gpu_count} CUDA GPU device(s), using GPU")
+        print(f"✓ CUDA is available: {gpu_count} device(s) detected")
+        print(f"✓ GPU device names: {[torch.cuda.get_device_name(i) for i in range(gpu_count)]}")
+        print("🎯 Selected device: CUDA GPU")
+        print("=" * 70 + "\n")
+        logger.info(f"[Device] Using CUDA GPU with {gpu_count} device(s)")
         return 'cuda', True
+    else:
+        print("✗ CUDA is not available")
 
     # Fallback to CPU
-    logger.info("No NPU or GPU detected, using CPU")
+    print("🎯 Selected device: CPU (no accelerator detected)")
+    print("=" * 70 + "\n")
+    logger.info("[Device] Using CPU (no accelerator available)")
     return 'cpu', False
 
 
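For orientation, here is a minimal sketch of how a caller would consume the `(device_type, is_accelerator)` pair returned above. The `ding.utils.device` import path is an assumption based on the [DI-engine] log prefix, not something this diff confirms:

    # Hypothetical usage sketch; only get_available_device() is defined in this diff.
    from ding.utils.device import get_available_device  # assumed module path

    device_type, is_accelerator = get_available_device()
    if is_accelerator:
        # 'npu' or 'cuda': models and batches must be moved to the accelerator
        print(f"Accelerator backend selected: {device_type}")
    else:
        # 'cpu': everything stays on the host, no transfers needed
        print("Running on CPU")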
@@ -80,13 +102,18 @@ def move_to_device(model: torch.nn.Module, device_type: str, rank: int = 0) -> t
     if device_type == 'npu' and TORCH_NPU_AVAILABLE:
         device_count = torch.npu.device_count()
         device_id = rank % device_count if device_count > 0 else 0
+        print(f"📦 [DI-engine] Moving model to NPU device {device_id} (rank={rank})")
         model.npu(device_id)
-        logger.debug(f"Moved model to NPU device {device_id}")
+        logger.info(f"[Device] Model moved to NPU device {device_id}")
     elif device_type == 'cuda':
         device_count = torch.cuda.device_count()
         device_id = rank % device_count if device_count > 0 else 0
+        print(f"📦 [DI-engine] Moving model to CUDA device {device_id} (rank={rank})")
         model.cuda(device_id)
-        logger.debug(f"Moved model to CUDA device {device_id}")
+        logger.info(f"[Device] Model moved to CUDA device {device_id}")
+    else:
+        print("📦 [DI-engine] Model will stay on CPU")
+        logger.info("[Device] Model stays on CPU")
     # CPU case: no need to move
     return model
 
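The `rank % device_count` expression above is what pins each distributed rank to a device. A tiny standalone illustration of that arithmetic (the helper name here is invented for the example, not part of the diff):

    # Toy reimplementation of the rank -> device mapping used by move_to_device().
    def pick_device_id(rank: int, device_count: int) -> int:
        # With N visible devices, rank r lands on device r % N (wrap-around),
        # and 0 is the fallback when no devices are visible.
        return rank % device_count if device_count > 0 else 0

    assert pick_device_id(0, 4) == 0
    assert pick_device_id(5, 4) == 1   # rank 5 wraps onto device 1
    assert pick_device_id(3, 0) == 0   # no devices: default to id 0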
@@ -128,38 +155,52 @@ def auto_device_init(cfg_device: Optional[str], rank: int = 0) -> Tuple[str, boo
         >>> # Returns ('cuda', True, 'cuda:0') if GPU available
         >>> # Returns ('cpu', False, 'cpu') otherwise
     """
+    print(f"\n⚙️ [DI-engine] Device Configuration: cfg_device='{cfg_device}', rank={rank}")
+
     # Default to auto detection if not specified
     if cfg_device is None or cfg_device == 'auto':
+        print("🔧 [DI-engine] Using auto-detection mode")
         device_type, use_accelerator = get_available_device()
     else:
         # Explicit device type specified
         device_type = cfg_device.lower()
+        print(f"🔧 [DI-engine] Explicit device type requested: '{device_type}'")
 
         # Validate the device type is available
         if device_type == 'npu':
             if TORCH_NPU_AVAILABLE and torch.npu.is_available():
                 use_accelerator = True
-                logger.info("Using NPU as explicitly configured")
+                npu_count = torch.npu.device_count()
+                print(f"✓ NPU requested and available: {npu_count} device(s)")
+                logger.info(f"[Device] Using NPU as explicitly configured ({npu_count} device(s))")
             else:
-                logger.warning("NPU requested but not available, falling back to CPU")
+                print("⚠️ NPU requested but not available, falling back to CPU")
+                logger.warning("[Device] NPU requested but not available, falling back to CPU")
                 device_type = 'cpu'
                 use_accelerator = False
         elif device_type == 'cuda':
             if torch.cuda.is_available():
                 use_accelerator = True
-                logger.info("Using CUDA GPU as explicitly configured")
+                gpu_count = torch.cuda.device_count()
+                print(f"✓ CUDA requested and available: {gpu_count} device(s)")
+                logger.info(f"[Device] Using CUDA GPU as explicitly configured ({gpu_count} device(s))")
             else:
-                logger.warning("CUDA requested but not available, falling back to CPU")
+                print("⚠️ CUDA requested but not available, falling back to CPU")
+                logger.warning("[Device] CUDA requested but not available, falling back to CPU")
                 device_type = 'cpu'
                 use_accelerator = False
         else:
             # CPU or any other value
             device_type = 'cpu'
             use_accelerator = False
-            logger.info("Using CPU as configured")
+            print("✓ Using CPU as configured")
+            logger.info("[Device] Using CPU as configured")
 
     device_str = get_device_string(device_type, rank)
 
+    print(f"✅ [DI-engine] Device initialized: type={device_type}, accelerator={use_accelerator}, device_string='{device_str}'")
+    print("=" * 70 + "\n")
+
     return device_type, use_accelerator, device_str
 
 
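To summarize the three configuration paths handled by `auto_device_init()`, a hedged usage sketch; the import path is again assumed, and the expected return values follow the docstring above:

    from ding.utils.device import auto_device_init  # assumed module path

    # 1) Auto-detection: None behaves the same as 'auto'.
    device_type, use_accelerator, device_str = auto_device_init('auto', rank=0)

    # 2) Explicit request: validated against the runtime, with CPU fallback
    #    if the requested accelerator is missing.
    device_type, use_accelerator, device_str = auto_device_init('cuda', rank=0)

    # 3) Forced CPU (per the docstring, this yields ('cpu', False, 'cpu')).
    assert auto_device_init('cpu') == ('cpu', False, 'cpu')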