1
1
import numpy as np
2
2
import torch
3
+ import time
3
4
from typing import Dict , List , Tuple , Optional
4
5
from modelscope .pipelines import pipeline
5
6
from modelscope .utils .constant import Tasks
@@ -53,13 +54,28 @@ def extract_voiceprint(self, audio_path: str) -> np.ndarray:
53
54
Returns:
54
55
np.ndarray: 声纹特征向量
55
56
"""
57
+ start_time = time .time ()
58
+ logger .info (f"开始提取声纹特征,音频文件: { audio_path } " )
59
+
56
60
try :
61
+ pipeline_start = time .time ()
57
62
result = self ._pipeline ([audio_path ], output_emb = True )
63
+ pipeline_time = time .time () - pipeline_start
64
+ logger .info (f"模型推理完成,耗时: { pipeline_time :.3f} 秒" )
65
+
66
+ convert_start = time .time ()
58
67
emb = self ._to_numpy (result ["embs" ][0 ]).astype (np .float32 )
59
- logger .debug (f"声纹特征提取成功,维度: { emb .shape } " )
68
+ convert_time = time .time () - convert_start
69
+ logger .info (f"数据转换完成,耗时: { convert_time :.3f} 秒" )
70
+
71
+ total_time = time .time () - start_time
72
+ logger .info (
73
+ f"声纹特征提取成功,维度: { emb .shape } ,总耗时: { total_time :.3f} 秒"
74
+ )
60
75
return emb
61
76
except Exception as e :
62
- logger .error (f"声纹特征提取失败: { e } " )
77
+ total_time = time .time () - start_time
78
+ logger .error (f"声纹特征提取失败,总耗时: { total_time :.3f} 秒,错误: { e } " )
63
79
raise
64
80
65
81
def calculate_similarity (self , emb1 : np .ndarray , emb2 : np .ndarray ) -> float :
@@ -138,30 +154,56 @@ def identify_voiceprint(
138
154
Returns:
139
155
Tuple[str, float]: (识别出的说话人ID, 相似度分数)
140
156
"""
157
+ start_time = time .time ()
158
+ logger .info (f"开始声纹识别流程,候选说话人数量: { len (speaker_ids )} " )
159
+
141
160
audio_path = None
142
161
try :
143
162
# 验证音频文件
163
+ validation_start = time .time ()
144
164
if not audio_processor .validate_audio_file (audio_bytes ):
145
165
logger .warning ("音频文件验证失败" )
146
166
return "" , 0.0
167
+ validation_time = time .time () - validation_start
168
+ logger .info (f"音频文件验证完成,耗时: { validation_time :.3f} 秒" )
147
169
148
170
# 处理音频文件
171
+ audio_process_start = time .time ()
149
172
audio_path = audio_processor .ensure_16k_wav (audio_bytes )
173
+ audio_process_time = time .time () - audio_process_start
174
+ logger .info (f"音频文件处理完成,耗时: { audio_process_time :.3f} 秒" )
150
175
151
176
# 提取声纹特征
177
+ extract_start = time .time ()
178
+ logger .info ("开始提取声纹特征..." )
152
179
test_emb = self .extract_voiceprint (audio_path )
180
+ extract_time = time .time () - extract_start
181
+ logger .info (f"声纹特征提取完成,耗时: { extract_time :.3f} 秒" )
153
182
154
183
# 获取候选声纹特征
184
+ db_query_start = time .time ()
185
+ logger .info ("开始查询数据库获取候选声纹特征..." )
155
186
voiceprints = voiceprint_db .get_voiceprints (speaker_ids )
187
+ db_query_time = time .time () - db_query_start
188
+ logger .info (
189
+ f"数据库查询完成,获取到{ len (voiceprints )} 个声纹特征,耗时: { db_query_time :.3f} 秒"
190
+ )
191
+
156
192
if not voiceprints :
157
193
logger .info ("未找到候选说话人声纹" )
158
194
return "" , 0.0
159
195
160
196
# 计算相似度
197
+ similarity_start = time .time ()
198
+ logger .info ("开始计算相似度..." )
161
199
similarities = {}
162
200
for name , emb in voiceprints .items ():
163
201
similarity = self .calculate_similarity (test_emb , emb )
164
202
similarities [name ] = similarity
203
+ similarity_time = time .time () - similarity_start
204
+ logger .info (
205
+ f"相似度计算完成,共计算{ len (similarities )} 个,耗时: { similarity_time :.3f} 秒"
206
+ )
165
207
166
208
# 找到最佳匹配
167
209
if not similarities :
@@ -172,19 +214,30 @@ def identify_voiceprint(
172
214
173
215
# 检查是否超过阈值
174
216
if match_score < self .similarity_threshold :
175
- logger .info (f"未识别到说话人,最高分: { match_score :.4f} " )
217
+ logger .info (
218
+ f"未识别到说话人,最高分: { match_score :.4f} ,阈值: { self .similarity_threshold } "
219
+ )
220
+ total_time = time .time () - start_time
221
+ logger .info (f"声纹识别流程完成,总耗时: { total_time :.3f} 秒" )
176
222
return "" , match_score
177
223
178
- logger .info (f"识别到说话人: { match_name } , 分数: { match_score :.4f} " )
224
+ total_time = time .time () - start_time
225
+ logger .info (
226
+ f"识别到说话人: { match_name } , 分数: { match_score :.4f} , 总耗时: { total_time :.3f} 秒"
227
+ )
179
228
return match_name , match_score
180
229
181
230
except Exception as e :
182
- logger .error (f"声纹识别异常: { e } " )
231
+ total_time = time .time () - start_time
232
+ logger .error (f"声纹识别异常,总耗时: { total_time :.3f} 秒,错误: { e } " )
183
233
return "" , 0.0
184
234
finally :
185
235
# 清理临时文件
236
+ cleanup_start = time .time ()
186
237
if audio_path :
187
238
audio_processor .cleanup_temp_file (audio_path )
239
+ cleanup_time = time .time () - cleanup_start
240
+ logger .debug (f"临时文件清理完成,耗时: { cleanup_time :.3f} 秒" )
188
241
189
242
def delete_voiceprint (self , speaker_id : str ) -> bool :
190
243
"""
@@ -205,7 +258,18 @@ def get_voiceprint_count(self) -> int:
205
258
Returns:
206
259
int: 声纹总数
207
260
"""
208
- return voiceprint_db .count_voiceprints ()
261
+ start_time = time .time ()
262
+ logger .info ("开始获取声纹总数..." )
263
+
264
+ try :
265
+ count = voiceprint_db .count_voiceprints ()
266
+ total_time = time .time () - start_time
267
+ logger .info (f"声纹总数获取完成: { count } ,耗时: { total_time :.3f} 秒" )
268
+ return count
269
+ except Exception as e :
270
+ total_time = time .time () - start_time
271
+ logger .error (f"获取声纹总数失败,总耗时: { total_time :.3f} 秒,错误: { e } " )
272
+ raise
209
273
210
274
211
275
# 全局声纹服务实例
0 commit comments