diff --git a/python/infinilm/generation/utils.py b/python/infinilm/generation/utils.py
index 4da145cd..31af58fe 100644
--- a/python/infinilm/generation/utils.py
+++ b/python/infinilm/generation/utils.py
@@ -11,32 +11,66 @@ def infini_to_ctype_dtype(infini_dtype):
     if infini_dtype == infinicore.int32:
         return ctypes.c_int32
+    elif infini_dtype == infinicore.int64:
+        return ctypes.c_int64
     elif infini_dtype == infinicore.float32:
         return ctypes.c_float
+    elif infini_dtype == infinicore.bfloat16:
+        # bfloat16 uses uint16 to read raw bytes
+        return ctypes.c_uint16
     else:
         raise ValueError(f"Unsupported py_dtype: {infini_dtype}")
 
 
 def infini_to_numpy(infini_tensor: infinicore.Tensor):
+    # Ensure data is on CPU
     if infini_tensor.device.type != "cpu":
         infini_tensor_cpu = infini_tensor.to(infinicore.device("cpu", 0))
+        # Sync to ensure copy is complete
+        infinicore.sync_stream()
     else:
         infini_tensor_cpu = infini_tensor
 
-    # 获取数据指针和形状信息
+    # Get data pointer and shape information
     data_ptr = infini_tensor_cpu.data_ptr()
     num_elements = infini_tensor_cpu.numel()
     original_shape = infini_tensor_cpu.shape
 
-    # 创建1D NumPy数组(共享内存)
-    ArrayType = infini_to_ctype_dtype(infini_tensor_cpu.dtype) * num_elements
-    array = ArrayType.from_address(data_ptr)
-    np_flat = np.ctypeslib.as_array(array)
-
-    # 重塑为原始形状
-    np_array = np_flat.reshape(original_shape)
-
-    return np.copy(np_array)
+    # Special handling for bfloat16
+    if infini_tensor_cpu.dtype == infinicore.bfloat16:
+        # bfloat16 is 16-bit, read as uint16
+        import ctypes
+        # Use safer approach: allocate memory first, then copy
+        buffer = (ctypes.c_uint16 * num_elements)()
+        ctypes.memmove(ctypes.addressof(buffer), data_ptr, num_elements * 2)  # 2 bytes per uint16
+        np_uint16 = np.array(buffer, dtype=np.uint16, copy=True)
+
+        # Convert uint16 to float32
+        # bfloat16 memory layout: shift uint16 left by 16 bits, then read as float32
+        np_uint32 = np_uint16.astype(np.uint32) << 16
+        np_array = np_uint32.view(np.float32).reshape(original_shape)
+    else:
+        # Determine element size and numpy dtype based on dtype
+        dtype_info_map = {
+            infinicore.int32: (4, np.int32),
+            infinicore.int64: (8, np.int64),
+            infinicore.float32: (4, np.float32),
+        }
+        element_size, np_dtype = dtype_info_map.get(infini_tensor_cpu.dtype, (4, np.float32))
+
+        # Use safer approach: allocate memory first, then copy
+        import ctypes
+        ctype = infini_to_ctype_dtype(infini_tensor_cpu.dtype)
+        buffer = (ctype * num_elements)()
+        ctypes.memmove(ctypes.addressof(buffer), data_ptr, num_elements * element_size)
+
+        # Convert to numpy array (using np.array instead of frombuffer, safer)
+        np_flat = np.array(buffer, dtype=np_dtype, copy=True)
+
+        # Reshape to original shape
+        np_array = np_flat.reshape(original_shape)
+
+    return np_array
 
 
 infinicore.Tensor.to_numpy = infini_to_numpy
@@ -197,6 +231,8 @@ def _sample(
         # -------------------------------------------------------------------------- #
         start_time = time.time()
         logits = self(**model_inputs)
+        # Ensure computation is complete - sync stream before reading logits
+        infinicore.sync_stream()
 
         # -------------------------------------------------------------------------- #
         # 处理输出
@@ -225,7 +261,7 @@ def _sample(
             out=out,
         )
 
-        infinicore.sync_stream()  # 计算结束前需要同步
+        infinicore.sync_stream()  # Sync before recording the end time
         end_time = time.time()
         time_list.append((end_time - start_time) * 1000)
@@ -245,11 +281,14 @@ def _sample(
             break
 
     print("\n")
-    print(
-        f"\n\n\n Time per step: prefill {round(time_list[0], 2)} token/ms\n",
-    )
-    print(
-        f" Time per step: decoder {round(sum(time_list[1:]) / (len(time_list) - 1), 2)} token/ms \n",
-    )
+
+    if len(time_list) > 0:
+        print(
+            f"\n\n\n Time per step: prefill {round(time_list[0], 2)} token/ms\n",
+        )
+    if len(time_list) > 1:
+        print(
+            f" Time per step: decoder {round(sum(time_list[1:]) / (len(time_list) - 1), 2)} token/ms \n",
+        )
 
     return output_tokens_list, output_content
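Note: the bfloat16 branch relies on bfloat16 being the upper 16 bits of an IEEE-754 float32, so widening is a left shift by 16. A minimal NumPy-only sketch of that round trip (the bf16_bits_to_float32 helper is illustrative, not part of this patch):

import numpy as np

def bf16_bits_to_float32(bits: np.ndarray) -> np.ndarray:
    # Widen raw bfloat16 bit patterns (uint16) to float32 by shifting
    # them into the top half of a uint32 and reinterpreting the bytes,
    # mirroring the conversion done in infini_to_numpy above.
    return (bits.astype(np.uint32) << 16).view(np.float32)

# Round trip: truncate float32 -> bfloat16 bits -> widen back.
values = np.array([1.0, -2.5, 3.14159], dtype=np.float32)
bf16_bits = (values.view(np.uint32) >> 16).astype(np.uint16)  # truncating cast
recovered = bf16_bits_to_float32(bf16_bits)
assert np.allclose(values, recovered, rtol=1e-2)  # bfloat16 keeps ~8 mantissa bits

The same reasoning motivates the ctypes.memmove path for the other dtypes: copying into an owned buffer, instead of wrapping the raw pointer with np.ctypeslib.as_array, ensures the returned array never aliases tensor memory that may be freed after the call.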