Merge pull request #1 from Sakura-RanChen/main

xinnan-tech · web-flow · commit 0855caaa1201 · 2025-06-13T14:48:36.000+08:00
windows和linux下测试无碍
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,3 @@
 __pycache__
 logs
-.voiceprint.yaml
+/data
diff --git a/app.py b/app.py
@@ -1,53 +1,202 @@
+import os
+import yaml
 import numpy as np
 import torch
+from fastapi import FastAPI, File, UploadFile, Form, Header, HTTPException
+from fastapi.responses import JSONResponse
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from db import VoiceprintDB
+import uvicorn
+import logging
+import soundfile as sf
+import librosa
+import tempfile
 
-# 初始化
-sv_pipeline = pipeline(
-    task=Tasks.speaker_verification, model="iic/speech_campplus_sv_zh-cn_3dspeaker_16k"
+# Configure logging at INFO level with a "timestamp level message" line format.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s %(message)s"
 )
+# Module-level logger used throughout this file.
+logger = logging.getLogger(__name__)
 
-voiceprints = {}
+# Directory that holds uploaded audio files while they are processed;
+# it is created at import time if it does not exist yet.
+TMP_DIR = "tmp"
+os.makedirs(TMP_DIR, exist_ok=True)
 
+def load_config():
+    """
+    Load the service configuration from ``data/.voiceprint.yaml``.
+
+    Only the local YAML file is consulted; environment variables are not read.
+
+    Returns:
+        dict: the parsed configuration mapping.
+
+    Raises:
+        RuntimeError: if the file is missing or does not contain a YAML mapping.
+    """
+    config_path = os.path.join("data", ".voiceprint.yaml")
+    if not os.path.exists(config_path):
+        logger.error("配置文件 data/.voiceprint.yaml 未找到，请先配置。")
+        raise RuntimeError("请先配置 data/.voiceprint.yaml")
+    with open(config_path, "r", encoding="utf-8") as f:
+        loaded = yaml.safe_load(f)
+    # safe_load returns None for an empty file (and a scalar/list for malformed
+    # content); fail here with a clear message instead of a TypeError on the
+    # first config[...] lookup.
+    if not isinstance(loaded, dict):
+        logger.error("配置文件 data/.voiceprint.yaml 内容无效，请检查配置。")
+        raise RuntimeError("配置文件 data/.voiceprint.yaml 内容无效")
+    return loaded
 
-def _to_numpy(x):
-    return x.cpu().numpy() if torch.is_tensor(x) else np.asarray(x)
+# Load the configuration once at import time. Any failure is logged and
+# re-raised, so the module cannot be imported without a usable config.
+try:
+    config = load_config()
+    # Shared secret that check_token compares against the request's token header.
+    API_TOKEN = config['server']['token']
+except Exception as e:
+    logger.error(f"配置加载失败: {e}")
+    raise
 
+# Open the database connection from the `mysql` section of the config;
+# a failure is logged and re-raised, aborting startup.
+try:
+    db = VoiceprintDB(config['mysql'])
+    logger.info("数据库连接成功。")
+except Exception as e:
+    logger.error(f"数据库连接失败: {e}")
+    raise
 
-def register_voiceprint(name, audio_path):
-    """登记声纹特征"""
-    result = sv_pipeline([audio_path], output_emb=True)
-    emb = _to_numpy(result["embs"][0])  # 1 条音频只取第 0 条
-    voiceprints[name] = emb
-    print(f"已登记: {name}")
+# Initialize the speaker-verification model (original author's note: thread-safe;
+# single-process deployment is recommended, e.g. gunicorn with a single worker).
+# A failure is logged and re-raised, aborting startup.
+try:
+    sv_pipeline = pipeline(
+        task=Tasks.speaker_verification, model="iic/speech_campplus_sv_zh-cn_3dspeaker_16k"
+    )
+    logger.info("声纹模型加载成功。")
+except Exception as e:
+    logger.error(f"声纹模型加载失败: {e}")
+    raise
 
+def _to_numpy(x):
+    """
+    Return *x* as a NumPy array.
+
+    Torch tensors are moved to the CPU before conversion; anything else is
+    handed to ``np.asarray``.
+    """
+    if torch.is_tensor(x):
+        return x.cpu().numpy()
+    return np.asarray(x)
 
-def identify_speaker(audio_path):
-    """识别声纹所属"""
-    test_result = sv_pipeline([audio_path], output_emb=True)
-    test_emb = _to_numpy(test_result["embs"][0])
+# FastAPI application instance; the route decorators below register their handlers on it.
+app = FastAPI(
+    title="3D-Speaker 声纹识别API",
+    description="基于3D-Speaker的声纹注册与识别服务"
+)
 
-    similarities = {}
-    for name, emb in voiceprints.items():
-        cos_sim = np.dot(test_emb, emb) / (
-            np.linalg.norm(test_emb) * np.linalg.norm(emb)
-        )
-        similarities[name] = cos_sim
+def check_token(token: str = Header(...)):
+    """
+    Validate the API token supplied by the caller.
+
+    Args:
+        token: value of the ``token`` request header.
+
+    Raises:
+        HTTPException: 401 when the token does not match the configured
+            ``API_TOKEN`` (or when no token is configured at all).
+    """
+    import hmac  # local import keeps this block self-contained
+
+    # Compare as UTF-8 bytes with a constant-time routine so the check does not
+    # leak how many leading characters matched. str() also covers a token that
+    # YAML parsed as a number (e.g. an unquoted `token: 123456`), which a plain
+    # `!=` against the header string could never match.
+    supplied = str(token).encode("utf-8")
+    expected = str(API_TOKEN).encode("utf-8")
+    if API_TOKEN is None or not hmac.compare_digest(supplied, expected):
+        logger.warning("无效的接口令牌。")
+        raise HTTPException(status_code=401, detail="无效的接口令牌")
 
-    match_name = max(similarities, key=similarities.get)
-    return match_name, similarities[match_name], similarities
+def ensure_16k_wav(audio_bytes):
+    """
+    Write uploaded audio bytes to a temporary ``.wav`` file under ``TMP_DIR``
+    and make sure that file is sampled at 16 kHz.
+
+    Args:
+        audio_bytes: raw bytes of the uploaded audio file.
+
+    Returns:
+        str: path of the temporary file. The caller is responsible for
+        deleting it once it is no longer needed.
+
+    Raises:
+        Exception: whatever the read/resample/write step raises; the temporary
+        file is removed before the exception propagates.
+    """
+    tmp_path = None
+    try:
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav", dir=TMP_DIR) as tmpf:
+            tmp_path = tmpf.name
+            tmpf.write(audio_bytes)
+        # Read back to learn the original sample rate.
+        data, sr = sf.read(tmp_path)
+        if sr != 16000:
+            if data.ndim == 1:
+                data_rs = librosa.resample(data, orig_sr=sr, target_sr=16000)
+            else:
+                # Multi-channel audio: resample each channel separately, then
+                # restore the (frames, channels) layout.
+                data_rs = np.vstack([
+                    librosa.resample(data[:, ch], orig_sr=sr, target_sr=16000)
+                    for ch in range(data.shape[1])
+                ]).T
+            sf.write(tmp_path, data_rs, 16000)
+    except Exception:
+        # On failure the caller never receives the path, so it cannot clean up;
+        # remove the file here to avoid leaking it in TMP_DIR.
+        if tmp_path and os.path.exists(tmp_path):
+            os.remove(tmp_path)
+        raise
+    return tmp_path
 
+@app.post("/register", summary="声纹注册")
+async def register(
+    token: str = Header(..., description="接口令牌"),
+    speaker_id: str = Form(..., description="说话人ID"),
+    file: UploadFile = File(..., description="WAV音频文件")
+):
+    """
+    Register the voiceprint of one speaker.
+
+    Args:
+        token: API token (request header).
+        speaker_id: speaker ID; surrounding whitespace is ignored.
+        file: audio file of the speaker (WAV).
+
+    Returns:
+        dict with ``success`` and a human-readable ``msg``.
+
+    Raises:
+        HTTPException: 401 for a bad token, 400 for a blank speaker ID,
+            500 when embedding extraction or storage fails.
+    """
+    check_token(token)
+    # identify() strips every candidate ID before querying, so an ID stored with
+    # surrounding whitespace could never be matched; normalize it the same way
+    # and reject IDs that are blank after stripping.
+    speaker_id = speaker_id.strip()
+    if not speaker_id:
+        logger.warning("说话人ID不能为空。")
+        raise HTTPException(status_code=400, detail="说话人ID不能为空")
+    audio_path = None
+    try:
+        audio_bytes = await file.read()
+        audio_path = ensure_16k_wav(audio_bytes)
+        result = sv_pipeline([audio_path], output_emb=True)
+        # One audio file in, so only the first embedding is relevant.
+        emb = _to_numpy(result["embs"][0]).astype(np.float32)
+        db.save_voiceprint(speaker_id, emb)
+        logger.info(f"声纹注册成功: {speaker_id}")
+        return {"success": True, "msg": f"已登记: {speaker_id}"}
+    except Exception as e:
+        logger.error(f"声纹注册失败: {e}")
+        raise HTTPException(status_code=500, detail=f"声纹注册失败: {e}")
+    finally:
+        # Always remove the temporary audio file, on success or failure.
+        if audio_path and os.path.exists(audio_path):
+            os.remove(audio_path)
 
-if __name__ == "__main__":
-    register_voiceprint("max_output_size", "test//test0.wav")
-    register_voiceprint("tts1", "test//test1.wav")
+@app.post("/identify", summary="声纹识别")
+async def identify(
+    token: str = Header(..., description="接口令牌"),
+    speaker_ids: str = Form(..., description="候选说话人ID，逗号分隔"),
+    file: UploadFile = File(..., description="WAV音频文件")
+):
+    """
+    Identify which candidate speaker an audio sample belongs to.
+
+    Args:
+        token: API token (request header).
+        speaker_ids: comma-separated candidate speaker IDs.
+        file: audio file to identify (WAV).
+
+    Returns:
+        dict ``{"speaker_id": ..., "score": ...}``. ``speaker_id`` is an empty
+        string when no candidate voiceprint exists or when the best cosine
+        similarity is below the 0.2 acceptance threshold.
+
+    Raises:
+        HTTPException: 401 for a bad token, 400 when no candidate ID is given,
+            500 when embedding extraction or the database lookup fails.
+    """
+    check_token(token)
+    candidate_ids = [x.strip() for x in speaker_ids.split(",") if x.strip()]
+    if not candidate_ids:
+        logger.warning("候选说话人ID不能为空。")
+        raise HTTPException(status_code=400, detail="候选说话人ID不能为空")
+    audio_path = None
+    try:
+        audio_bytes = await file.read()
+        audio_path = ensure_16k_wav(audio_bytes)
+        result = sv_pipeline([audio_path], output_emb=True)
+        # One audio file in, so only the first embedding is relevant.
+        test_emb = _to_numpy(result["embs"][0]).astype(np.float32)
+        voiceprints = db.get_voiceprints(candidate_ids)
+        if not voiceprints:
+            logger.info("未找到候选说话人声纹。")
+            return {"speaker_id": "", "score": 0.0}
+        test_norm = np.linalg.norm(test_emb)
+        similarities = {}
+        for name, emb in voiceprints.items():
+            denom = test_norm * np.linalg.norm(emb)
+            # A zero-length vector has no direction: score it 0.0 instead of
+            # letting the division produce NaN.
+            similarities[name] = float(np.dot(test_emb, emb) / denom) if denom > 0 else 0.0
+        match_name = max(similarities, key=similarities.get)
+        match_score = similarities[match_name]
+        if match_score < 0.2:
+            logger.info(f"未识别到说话人，最高分: {match_score}")
+            # Keep the response shape identical to the other branches; a bare
+            # `return` here would send a JSON `null` body to the client.
+            return {"speaker_id": "", "score": match_score}
+        logger.info(f"识别到说话人: {match_name}, 分数: {match_score}")
+        return {"speaker_id": match_name, "score": match_score}
+    except Exception as e:
+        logger.error(f"声纹识别失败: {e}")
+        raise HTTPException(status_code=500, detail=f"声纹识别失败: {e}")
+    finally:
+        # Always remove the temporary audio file, on success or failure.
+        if audio_path and os.path.exists(audio_path):
+            os.remove(audio_path)
 
-    test_file = "test//test2.wav"
-    match_name, match_score, all_scores = identify_speaker(test_file)
+@app.get("/", include_in_schema=False)
+def root():
+    """
+    Root path: report that the service is running.
+    """
+    payload = {"msg": "3D-Speaker voiceprint API service running."}
+    return JSONResponse(payload)
 
-    print(f"\n识别结果: {test_file} 属于 {match_name}")
-    print(f"匹配分数: {match_score:.4f}")
-    print("\n所有声纹对比分数:")
-    for name, score in all_scores.items():
-        print(f"{name}: {score:.4f}")
+if __name__ == "__main__":
+    # Script entry point: announce the listen address, then serve until interrupted.
+    try:
+        startup_msg = (
+            f"服务启动中，监听地址: {config['server']['host']}:{config['server']['port']}，"
+            f"文档: http://{config['server']['host']}:{config['server']['port']}/docs"
+        )
+        logger.info(startup_msg)
+        separator = "="*60
+        print(separator)
+        print(f"3D-Speaker 声纹API服务已启动，访问: http://{config['server']['host']}:{config['server']['port']}/docs")
+        print(separator)
+        server_cfg = config['server']
+        uvicorn.run("app:app", host=server_cfg['host'], port=server_cfg['port'])
+    except KeyboardInterrupt:
+        logger.info("收到中断信号，正在退出服务。")
diff --git a/db.py b/db.py
@@ -0,0 +1,57 @@
+import pymysql
+import numpy as np
+
+class VoiceprintDB:
+    """
+    Data-access helper that stores and loads speaker embeddings in the MySQL
+    ``voiceprints`` table (columns ``speaker_id`` and ``feature_vector``).
+
+    Embeddings are persisted as the raw bytes of a float32 vector.
+    """
+
+    def __init__(self, config):
+        """
+        Open the database connection.
+
+        :param config: dict with the connection settings
+            (host, port, user, password, database)
+        """
+        self.conn = pymysql.connect(
+            host=config['host'],
+            port=config['port'],
+            user=config['user'],
+            password=config['password'],
+            database=config['database'],
+            charset='utf8mb4',
+            autocommit=True
+        )
+
+    def _ensure_connection(self):
+        """
+        Re-open the connection if the server has dropped it.
+
+        The connection is created once and reused for the lifetime of the
+        process, so it must survive the server closing an idle session.
+        """
+        self.conn.ping(reconnect=True)
+
+    def save_voiceprint(self, speaker_id, emb):
+        """
+        Insert the embedding for a speaker, or overwrite the existing one.
+
+        :param speaker_id: str, speaker ID
+        :param emb: np.ndarray (or array-like), speaker embedding vector
+        """
+        # get_voiceprints always decodes float32, so always encode float32
+        # regardless of the dtype the caller passes in.
+        payload = np.asarray(emb, dtype=np.float32).tobytes()
+        self._ensure_connection()
+        with self.conn.cursor() as cursor:
+            sql = """
+            INSERT INTO voiceprints (speaker_id, feature_vector)
+            VALUES (%s, %s)
+            ON DUPLICATE KEY UPDATE feature_vector=VALUES(feature_vector)
+            """
+            cursor.execute(sql, (speaker_id, payload))
+
+    def get_voiceprints(self, speaker_ids=None):
+        """
+        Fetch the embeddings of the given speakers (all speakers when
+        ``speaker_ids`` is None or empty).
+
+        :param speaker_ids: list[str], speaker IDs to look up
+        :return: dict, {speaker_id: np.ndarray of float32}
+        """
+        self._ensure_connection()
+        with self.conn.cursor() as cursor:
+            if speaker_ids:
+                # One %s placeholder per ID keeps the IN (...) query parameterized.
+                format_strings = ','.join(['%s'] * len(speaker_ids))
+                sql = f"SELECT speaker_id, feature_vector FROM voiceprints WHERE speaker_id IN ({format_strings})"
+                cursor.execute(sql, tuple(speaker_ids))
+            else:
+                sql = "SELECT speaker_id, feature_vector FROM voiceprints"
+                cursor.execute(sql)
+            results = cursor.fetchall()
+            # Decode the stored binary blobs back into float32 vectors.
+            return {row[0]: np.frombuffer(row[1], dtype=np.float32) for row in results}
diff --git a/requirements.txt b/requirements.txt
@@ -7,4 +7,10 @@ transformers==4.52.4
 torch==2.2.2
 sentencepiece==0.2.0
 soundfile==0.13.1
-torchaudio==2.2.2
+torchaudio==2.2.2
+pyyaml==6.0.1
+fastapi==0.110.2
+uvicorn==0.29.0
+PyMySQL==1.1.0
+python-multipart==0.0.9
+librosa==0.10.1
diff --git a/test_user.py b/test_user.py
@@ -0,0 +1,23 @@
+import requests
+
+token = "123456"  # replace with your real token
+base_url = "http://192.168.4.82:8000"
+
+# # Register three speakers
+# for i in range(3):
+#     wav_path = f"test/test{i}.wav"
+#     speaker_id = f"user_{i}"
+#     data = {'speaker_id': speaker_id}
+#     headers = {'token': token}
+#     with open(wav_path, 'rb') as wav_file:
+#         files = {'file': wav_file}
+#         resp = requests.post(f"{base_url}/register", files=files, data=data, headers=headers)
+#     print(f"注册 {speaker_id}:", resp.json())
+
+# Speaker identification
+wav_path = "test/test2.wav"
+candidate_ids = "user_0,user_1,user_2"
+data = {'speaker_ids': candidate_ids}
+headers = {'token': token}
+# Open the upload inside a context manager so the file handle is closed
+# as soon as the request has been sent.
+with open(wav_path, 'rb') as wav_file:
+    files = {'file': wav_file}
+    resp = requests.post(f"{base_url}/identify", files=files, data=data, headers=headers)
+print("识别结果:", resp.json())
diff --git a/voiceprint.yaml b/voiceprint.yaml
@@ -0,0 +1,19 @@
+# NOTE(review): app.py loads its configuration from data/.voiceprint.yaml;
+# presumably this template is meant to be copied there — confirm.
+server:
+  # Listen address; 0.0.0.0 binds all network interfaces
+  host: 0.0.0.0
+  # Listen port
+  port: 8004
+  # API access token; callers must send it in the `token` request header
+  token: "your_api_token"  
+
+mysql:
+  # MySQL server host
+  host: "localhost"
+  # Port
+  port: 3306
+  # User name
+  user: "root"
+  # Password
+  password: "your_password"
+  # Database name
+  database: "voiceprint_db"

-Original file line number
+Diff line change
@@ @@ -1,3 +1,3 @@ @@
 __pycache__
 logs
 -.voiceprint.yaml
 +/data