Skip to content

Commit d0f4cbb

Browse files
committed
release: v0.0.38
- 新增后端适配层架构,支持 SiliconFlow, OpenAI, Google Cloud 等多种服务商
- 实现基于 TCP 的 API 服务器
- 优化音频处理逻辑与识别准确度
- 更新更新日志与版本号
1 parent 653d6cd commit d0f4cbb

File tree

11 files changed

+484
-243
lines changed

11 files changed

+484
-243
lines changed

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "voice2type"
3-
version = "0.0.37"
3+
version = "0.0.38"
44
edition = "2021"
55
authors = ["guchang233"]
66

src/audio/mod.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1-
pub mod processor;
1+
pub mod processor;
2+
pub mod vad;

src/audio/processor.rs

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,21 +7,36 @@ use std::io::Cursor;
77
/// Converts floating-point PCM samples to 16-bit PCM at 16 kHz.
///
/// Every sample is multiplied by a fixed gain of 3.0, clamped to
/// `[-1.0, 1.0]` and scaled to the `i16` range.
///
/// The returned rate is always 16 000 Hz, and the returned samples really are
/// at that rate:
///
/// * `input_rate >= 16 000`: each output sample is the mean of the input
///   samples that fall inside its time window. For exact integer ratios
///   (32 kHz, 48 kHz, ...) this is the same fixed-size chunk averaging the
///   function has always done.
/// * `input_rate < 16 000`: the signal is upsampled with linear interpolation.
///
/// Previously these cases were labelled 16 kHz without being resampled (or
/// after rounding the ratio to an integer), so e.g. 8 kHz or 44.1 kHz input
/// was handed downstream with a wrong sample rate.
///
/// # Arguments
/// * `input` - mono samples, nominally in `[-1.0, 1.0]`.
/// * `input_rate` - sample rate of `input` in Hz. `0` is treated as "already
///   at the target rate" so the call never divides by zero.
///
/// # Returns
/// `(samples, 16_000)`. Empty input yields an empty vector.
pub fn resample_and_convert(input: &[f32], input_rate: u32) -> (Vec<i16>, u32) {
    const TARGET_RATE: u32 = 16_000;
    // Boost quiet microphones so the recognition API reliably detects speech.
    const GAIN: f32 = 3.0;

    if input.is_empty() {
        return (Vec::new(), TARGET_RATE);
    }

    // An unknown (zero) rate cannot be resampled; convert the samples as-is.
    let input_rate = if input_rate == 0 { TARGET_RATE } else { input_rate };

    // Apply gain, clamp, then scale. Float-to-int `as` casts saturate and map
    // NaN to 0, so this can never wrap around.
    let quantize = |sample: f32| -> i16 { ((sample * GAIN).clamp(-1.0, 1.0) * 32767.0) as i16 };

    let rate_in = u64::from(input_rate);
    let rate_out = u64::from(TARGET_RATE);

    // Output length = ceil(len * rate_out / rate_in), so a trailing partial
    // window still produces a sample (as `chunks` did before).
    let out_len = ((input.len() as u64 * rate_out + rate_in - 1) / rate_in) as usize;

    let output: Vec<i16> = if input_rate >= TARGET_RATE {
        // Downsample (or pass through at equal rates): average each window.
        // Integer arithmetic keeps the window edges exact.
        (0..out_len as u64)
            .map(|i| {
                let start = (i * rate_in / rate_out) as usize;
                // `start < input.len()` always holds, so the clamp bounds are valid
                // and every window contains at least one sample.
                let end = (((i + 1) * rate_in / rate_out) as usize).clamp(start + 1, input.len());
                let window = &input[start..end];
                quantize(window.iter().sum::<f32>() / window.len() as f32)
            })
            .collect()
    } else {
        // Upsample: linear interpolation between neighbouring input samples.
        let step = f64::from(input_rate) / f64::from(TARGET_RATE);
        let last = input.len() - 1;
        (0..out_len)
            .map(|i| {
                let pos = i as f64 * step;
                let idx = (pos as usize).min(last);
                let next = (idx + 1).min(last);
                let frac = (pos - idx as f64) as f32;
                quantize(input[idx] + (input[next] - input[idx]) * frac)
            })
            .collect()
    };

    (output, TARGET_RATE)
}
4257

4358
/// 内存中编码WAV数据

src/audio/vad.rs

Lines changed: 237 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
use std::collections::VecDeque;
2+
3+
/// Energy-based voice activity detector (VAD).
///
/// Each frame's mean-square energy is compared against a threshold. The
/// speech/silence state only flips after a configurable number of consecutive
/// frames agree, which filters out short clicks and short pauses.
#[derive(Debug, Clone)]
pub struct VoiceActivityDetector {
    /// Mean-square energy above which a frame counts as speech.
    energy_threshold: f32,
    /// Consecutive silent frames required to leave the speech state.
    silence_frame_threshold: usize,
    /// Consecutive speech frames required to enter the speech state.
    speech_frame_threshold: usize,
    /// Frame size in samples; kept at 1 or more by `set_frame_size`.
    frame_size: usize,
    /// Sample rate in Hz.
    sample_rate: u32,
    /// Energies of the most recent frames, oldest first.
    energy_history: VecDeque<f32>,
    /// Current detector state.
    is_speech: bool,
    /// Number of consecutive frames classified as speech.
    speech_frame_count: usize,
    /// Number of consecutive frames classified as silence.
    silence_frame_count: usize,
}

impl Default for VoiceActivityDetector {
    /// Defaults: threshold 0.01, 10 silent frames to stop, 3 speech frames to
    /// start, 1024-sample frames, 16 kHz.
    fn default() -> Self {
        Self {
            energy_threshold: 0.01,
            silence_frame_threshold: 10,
            speech_frame_threshold: 3,
            frame_size: 1024,
            sample_rate: 16000,
            energy_history: VecDeque::with_capacity(Self::ENERGY_HISTORY_LEN),
            is_speech: false,
            speech_frame_count: 0,
            silence_frame_count: 0,
        }
    }
}

impl VoiceActivityDetector {
    /// Number of frame energies retained in the history.
    const ENERGY_HISTORY_LEN: usize = 30;

    /// Creates a detector with the default parameters.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the mean-square energy threshold.
    pub fn set_energy_threshold(&mut self, threshold: f32) {
        self.energy_threshold = threshold;
    }

    /// Sets how many consecutive silent frames end the speech state.
    pub fn set_silence_frame_threshold(&mut self, threshold: usize) {
        self.silence_frame_threshold = threshold;
    }

    /// Sets how many consecutive speech frames start the speech state.
    pub fn set_speech_frame_threshold(&mut self, threshold: usize) {
        self.speech_frame_threshold = threshold;
    }

    /// Sets the frame size in samples.
    ///
    /// A value of `0` is stored as `1`: the frame size is used as a chunk
    /// length when splitting audio, and `slice::chunks` panics on zero.
    pub fn set_frame_size(&mut self, frame_size: usize) {
        self.frame_size = frame_size.max(1);
    }

    /// Sets the sample rate in Hz.
    pub fn set_sample_rate(&mut self, sample_rate: u32) {
        self.sample_rate = sample_rate;
    }

    /// Returns the configured frame size in samples.
    #[must_use]
    pub fn frame_size(&self) -> usize {
        self.frame_size
    }

    /// Returns the configured sample rate in Hz.
    #[must_use]
    pub fn sample_rate(&self) -> u32 {
        self.sample_rate
    }

    /// Returns the mean-square energy of `frame`, or `0.0` for an empty frame.
    pub fn calculate_frame_energy(&self, frame: &[f32]) -> f32 {
        if frame.is_empty() {
            return 0.0;
        }

        let sum_of_squares: f32 = frame.iter().map(|&sample| sample * sample).sum();
        sum_of_squares / frame.len() as f32
    }

    /// Feeds one frame into the detector and returns the updated state
    /// (`true` while in the speech state).
    pub fn process_frame(&mut self, frame: &[f32]) -> bool {
        let energy = self.calculate_frame_energy(frame);

        // Evict before pushing so the buffer never outgrows its preallocated
        // capacity.
        if self.energy_history.len() >= Self::ENERGY_HISTORY_LEN {
            self.energy_history.pop_front();
        }
        self.energy_history.push_back(energy);

        if energy > self.energy_threshold {
            // A speech frame breaks any run of silence.
            self.silence_frame_count = 0;
            self.speech_frame_count = self.speech_frame_count.saturating_add(1);

            if !self.is_speech && self.speech_frame_count >= self.speech_frame_threshold {
                self.is_speech = true;
            }
        } else {
            // A silent frame breaks any run of speech.
            self.speech_frame_count = 0;
            self.silence_frame_count = self.silence_frame_count.saturating_add(1);

            if self.is_speech && self.silence_frame_count >= self.silence_frame_threshold {
                self.is_speech = false;
            }
        }

        self.is_speech
    }

    /// Returns `true` while the detector is in the speech state.
    pub fn is_speech(&self) -> bool {
        self.is_speech
    }

    /// Clears the energy history, the state and both run counters.
    /// Configuration (thresholds, frame size, sample rate) is kept.
    pub fn reset(&mut self) {
        self.energy_history.clear();
        self.is_speech = false;
        self.speech_frame_count = 0;
        self.silence_frame_count = 0;
    }

    /// Returns the retained frame energies, oldest first.
    pub fn energy_history(&self) -> &VecDeque<f32> {
        &self.energy_history
    }
}
141+
142+
/// 音频分片器
143+
/// 用于根据VAD结果将音频分割成合适的片段
144+
#[derive(Debug, Clone)]
145+
pub struct AudioSegmenter {
146+
vad: VoiceActivityDetector,
147+
// 最大片段长度(样本数)
148+
max_segment_length: usize,
149+
// 当前片段
150+
current_segment: Vec<f32>,
151+
// 采样率
152+
sample_rate: u32,
153+
}
154+
155+
impl Default for AudioSegmenter {
156+
fn default() -> Self {
157+
Self {
158+
vad: VoiceActivityDetector::default(),
159+
max_segment_length: 16000 * 5, // 5秒音频 @ 16kHz
160+
current_segment: Vec::new(),
161+
sample_rate: 16000,
162+
}
163+
}
164+
}
165+
166+
impl AudioSegmenter {
167+
/// 创建新的音频分片器
168+
pub fn new(sample_rate: u32) -> Self {
169+
let mut segmenter = Self::default();
170+
segmenter.sample_rate = sample_rate;
171+
segmenter.vad.set_sample_rate(sample_rate);
172+
segmenter
173+
}
174+
175+
/// 设置VAD参数
176+
pub fn set_vad_params(
177+
&mut self,
178+
energy_threshold: f32,
179+
silence_frame_threshold: usize,
180+
speech_frame_threshold: usize,
181+
frame_size: usize,
182+
) {
183+
self.vad.set_energy_threshold(energy_threshold);
184+
self.vad.set_silence_frame_threshold(silence_frame_threshold);
185+
self.vad.set_speech_frame_threshold(speech_frame_threshold);
186+
self.vad.set_frame_size(frame_size);
187+
}
188+
189+
/// 设置最大片段长度
190+
pub fn set_max_segment_length(&mut self, max_length_ms: u64) {
191+
let max_length_samples = (max_length_ms as f32 * self.sample_rate as f32 / 1000.0) as usize;
192+
self.max_segment_length = max_length_samples;
193+
}
194+
195+
/// 处理音频数据
196+
/// 返回是否需要分片
197+
pub fn process_audio(&mut self, audio_data: &[f32]) -> bool {
198+
// 将音频数据分帧处理
199+
for frame in audio_data.chunks(self.vad.frame_size) {
200+
// 处理当前帧
201+
let is_speech = self.vad.process_frame(frame);
202+
203+
// 将当前帧添加到当前片段
204+
self.current_segment.extend_from_slice(frame);
205+
206+
// 检查是否需要分片
207+
// 1. 如果片段长度超过最大值
208+
// 2. 如果检测到静默且片段不为空
209+
if self.current_segment.len() > self.max_segment_length || (!is_speech && !self.current_segment.is_empty()) {
210+
return true;
211+
}
212+
}
213+
214+
false
215+
}
216+
217+
/// 获取当前片段
218+
pub fn get_current_segment(&self) -> &[f32] {
219+
&self.current_segment
220+
}
221+
222+
/// 取出当前片段
223+
pub fn take_current_segment(&mut self) -> Vec<f32> {
224+
std::mem::take(&mut self.current_segment)
225+
}
226+
227+
/// 重置分片器
228+
pub fn reset(&mut self) {
229+
self.vad.reset();
230+
self.current_segment.clear();
231+
}
232+
233+
/// 检查是否有语音活动
234+
pub fn is_speech(&self) -> bool {
235+
self.vad.is_speech()
236+
}
237+
}

src/config.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ impl Default for AppConfig {
5656
ignored_version: String::new(),
5757
output_language: "auto".to_string(),
5858
enable_streaming: false, // 默认关闭流式输出
59-
streaming_interval: 2000, // 默认 2000 毫秒
59+
streaming_interval: 500, // 默认 500 毫秒
6060
trigger_mode: "hold".to_string(), // 默认按住输入模式
6161
speech_service: "siliconflow".to_string(), // 默认使用SiliconFlow语音识别服务
6262
// 指示器默认配置

src/core/state.rs

Lines changed: 1 addition & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -3,49 +3,6 @@
33
pub enum AppState {
44
Idle,
55
Recording,
6-
Streaming, // 新增:流式处理状态
76
Processing,
8-
Cancelled, // 新增:取消状态
9-
}
10-
11-
/// 流式处理状态
12-
#[derive(Debug, Clone)]
13-
pub struct StreamingState {
14-
pub current_chunk: Vec<f32>,
15-
pub chunk_count: usize,
16-
pub last_result: String,
17-
pub output_sent: bool,
18-
// API调用节流相关字段
19-
pub processing: bool, // 是否有正在进行的API调用
20-
}
21-
22-
impl StreamingState {
23-
/// 创建新的流式处理状态
24-
pub fn new() -> Self {
25-
// 合理设置初始容量,避免过度分配
26-
let initial_chunk_capacity = 16000; // 1秒音频 @ 16kHz
27-
28-
Self {
29-
current_chunk: Vec::with_capacity(initial_chunk_capacity),
30-
chunk_count: 0,
31-
last_result: String::new(),
32-
output_sent: false,
33-
// API调用节流默认参数
34-
processing: false,
35-
}
36-
}
37-
38-
/// 更新最后结果
39-
pub fn update_last_result(&mut self, result: &str) {
40-
self.last_result = result.to_string();
41-
}
42-
43-
/// 重置流式处理状态
44-
pub fn reset(&mut self) {
45-
self.current_chunk.clear();
46-
self.chunk_count = 0;
47-
self.last_result.clear();
48-
self.output_sent = false;
49-
self.processing = false;
50-
}
7+
Cancelled, // 取消状态
518
}

0 commit comments

Comments
 (0)