添加测试音频文件

openrz · openrz · commit a5baabad9326 · 2025-06-09T18:48:48.000+08:00
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+__pycache__
+logs
+.voiceprint.yaml
diff --git a/LICENSE b/LICENSE
@@ -186,7 +186,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright [yyyy] [name of copyright owner]
+   Copyright 2025 xinnan-tech
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
diff --git a/README.md b/README.md
@@ -1,2 +1,64 @@
 # voiceprint-api
-声纹识别api
+
+基于3D-Speaker的声纹识别API服务
+
+## 项目简介
+
+本项目是一个基于FastAPI开发的声纹识别HTTP服务，使用3D-Speaker模型实现声纹识别功能。支持声纹注册和识别功能，并提供完整的API文档。
+
+目前已应用于[xiaozhi-esp32-server](https://github.com/xinnan-tech/xiaozhi-esp32-server)项目，用于识别小智设备的说话人。
+
+## 主要功能
+
+1. 声纹注册
+   - 输入：说话人ID和声音WAV文件
+   - 输出：注册成功状态
+
+2. 声纹识别
+   - 输入：可能的说话人ID列表（逗号分隔）和声音WAV文件
+   - 输出：识别到的说话人ID（未识别则返回空）
+
+## 技术栈
+
+- FastAPI：Web框架
+- 3D-Speaker：声纹识别模型
+- MySQL：数据存储
+
+## 安装说明
+
+1. 克隆项目
+```bash
+git clone https://github.com/xinnan-tech/voiceprint-api.git
+cd voiceprint-api
+```
+
+2. 安装依赖
+```bash
+conda remove -n voiceprint-api --all -y
+conda create -n voiceprint-api python=3.11 -y
+conda activate voiceprint-api
+
+pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/
+pip install -r requirements.txt
+```
+
+3. 配置数据库
+- 创建数据库
+```sql
+CREATE DATABASE voiceprint_db CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
+```
+- 创建数据表
+```sql
+CREATE TABLE voiceprints (
+    id INT AUTO_INCREMENT PRIMARY KEY,
+    speaker_id VARCHAR(50) UNIQUE,
+    feature_vector LONGBLOB NOT NULL,
+    INDEX idx_speaker_id (speaker_id)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
+```
+- 复制 `voiceprint.yaml` 为 `data/.voiceprint.yaml`
+
+4. 启动
+```bash
+python app.py
+```
diff --git a/app.py b/app.py
@@ -0,0 +1,88 @@
+"""Minimal speaker-identification demo built on a ModelScope pipeline.
+
+Registered voiceprints live in an in-memory dict for the life of the process.
+"""
+
+import numpy as np
+import torch
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+# Build the speaker-verification pipeline once, at import time.
+sv_pipeline = pipeline(
+    task=Tasks.speaker_verification, model="iic/speech_campplus_sv_zh-cn_3dspeaker_16k"
+)
+
+# Registered speakers: name -> embedding as a numpy array.
+voiceprints = {}
+
+
+def _to_numpy(x):
+    """Return *x* as a numpy array, moving torch tensors to the CPU first."""
+    return x.cpu().numpy() if torch.is_tensor(x) else np.asarray(x)
+
+
+def _cosine_similarity(a, b):
+    """Return the cosine similarity of *a* and *b*, or 0.0 if a norm is zero."""
+    denom = np.linalg.norm(a) * np.linalg.norm(b)
+    if denom == 0:
+        # An all-zero embedding would otherwise divide by zero and yield NaN.
+        return 0.0
+    return float(np.dot(a, b) / denom)
+
+
+def register_voiceprint(name, audio_path):
+    """Extract the embedding of *audio_path* and store it under *name*.
+
+    Registering an existing *name* overwrites its previous embedding.
+    """
+    result = sv_pipeline([audio_path], output_emb=True)
+    # One audio file was submitted, so only the first embedding is used.
+    emb = _to_numpy(result["embs"][0])
+    voiceprints[name] = emb
+    print(f"已登记: {name}")
+
+
+def identify_speaker(audio_path):
+    """Find the registered speaker most similar to *audio_path*.
+
+    Returns:
+        A tuple ``(match_name, match_score, similarities)`` where
+        ``similarities`` maps every registered name to its cosine similarity.
+
+    Raises:
+        ValueError: If no voiceprint has been registered yet.
+    """
+    if not voiceprints:
+        # Fail before running inference; max() on an empty dict would raise
+        # a ValueError anyway, but with a far less helpful message.
+        raise ValueError("no voiceprints registered; call register_voiceprint first")
+
+    test_result = sv_pipeline([audio_path], output_emb=True)
+    test_emb = _to_numpy(test_result["embs"][0])
+
+    similarities = {
+        name: _cosine_similarity(test_emb, emb) for name, emb in voiceprints.items()
+    }
+
+    match_name = max(similarities, key=similarities.get)
+    return match_name, similarities[match_name], similarities
+
+
+def main():
+    """Register two sample recordings, then identify a third and print scores."""
+    register_voiceprint("max_output_size", "test//test0.wav")
+    register_voiceprint("tts1", "test//test1.wav")
+
+    test_file = "test//test2.wav"
+    match_name, match_score, all_scores = identify_speaker(test_file)
+
+    print(f"\n识别结果: {test_file} 属于 {match_name}")
+    print(f"匹配分数: {match_score:.4f}")
+    print("\n所有声纹对比分数:")
+    for name, score in all_scores.items():
+        print(f"{name}: {score:.4f}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,10 @@
+modelscope==1.11.0
+datasets==2.14.5
+numpy==1.23.5
+packaging==21.3
+addict==2.4.0
+transformers==4.52.4
+torch==2.2.2
+sentencepiece==0.2.0
+soundfile==0.13.1
+torchaudio==2.2.2

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+__pycache__`
	`2`	`+logs`
	`3`	`+.voiceprint.yaml`