guchang233
diff --git a/Cargo.lock b/Cargo.lock
Lines changed: 1 addition & 1 deletion
diff --git a/Cargo.toml b/Cargo.toml
Lines changed: 1 addition & 1 deletion
diff --git a/src/audio/mod.rs b/src/audio/mod.rs
Lines changed: 2 additions & 1 deletion
diff --git a/src/audio/processor.rs b/src/audio/processor.rs
Lines changed: 23 additions & 8 deletions
diff --git a/src/audio/vad.rs b/src/audio/vad.rs
Lines changed: 237 additions & 0 deletions
diff --git a/src/config.rs b/src/config.rs
Lines changed: 1 addition & 1 deletion
diff --git a/src/core/state.rs b/src/core/state.rs
Lines changed: 1 addition & 44 deletions
@@ -1,6 +1,6 @@
 [package]
 name = "voice2type"
-version = "0.0.37"
+version = "0.0.38"
 edition = "2021"
 authors = ["guchang233"]
 
 
@@ -1 +1,2 @@
-pub mod processor;
+pub mod processor;
+pub mod vad;
@@ -7,21 +7,36 @@ use std::io::Cursor;
 pub fn resample_and_convert(input: &[f32], input_rate: u32) -> (Vec<i16>, u32) {
     let target_rate = 16000;
 
+    // 计算增益因子，提高音频音量
+    // 增加增益到3.0以确保API能检测到语音
+    let gain = 3.0;
+    
+    // 如果输入为空，返回空向量
+    if input.is_empty() {
+        return (Vec::new(), target_rate);
+    }
+    
     // 如果原始采样率小于等于目标采样率，不做降采样，直接转换
     if input_rate <= target_rate {
         let output: Vec<i16> = input.iter()
-            .map(|&s| (s.clamp(-1.0, 1.0) * 32767.0) as i16)
+            .map(|&s| {
+                let amplified = s * gain;
+                (amplified.clamp(-1.0, 1.0) * 32767.0) as i16
+            })
             .collect();
-        return (output, input_rate);
+        return (output, target_rate);
     }
 
     // 计算降采样比率 (简单的整数比率)
     let ratio = (input_rate as f32 / target_rate as f32).round() as usize;
     if ratio <= 1 {
          let output: Vec<i16> = input.iter()
-            .map(|&s| (s.clamp(-1.0, 1.0) * 32767.0) as i16)
+            .map(|&s| {
+                let amplified = s * gain;
+                (amplified.clamp(-1.0, 1.0) * 32767.0) as i16
+            })
             .collect();
-        return (output, input_rate);
+        return (output, target_rate);
     }
 
     let est_capacity = input.len() / ratio + 1;
@@ -31,13 +46,13 @@ pub fn resample_and_convert(input: &[f32], input_rate: u32) -> (Vec<i16>, u32) {
     for chunk in input.chunks(ratio) {
         let sum: f32 = chunk.iter().sum();
         let avg = sum / chunk.len() as f32;
-        let sample_i16 = (avg.clamp(-1.0, 1.0) * 32767.0) as i16;
+        let amplified = avg * gain;
+        let sample_i16 = (amplified.clamp(-1.0, 1.0) * 32767.0) as i16;
         output.push(sample_i16);
     }
 
-    // 计算实际的新采样率
-    let actual_new_rate = input_rate / ratio as u32;
-    (output, actual_new_rate)
+    // 强制使用目标采样率
+    (output, target_rate)
 }
 
 /// 内存中编码WAV数据
 
@@ -0,0 +1,237 @@
+use std::collections::VecDeque;
+
+/// VAD（语音活动检测）模块
+/// 用于检测音频中的语音活动和非活动时段
+#[derive(Debug, Clone)]
+pub struct VoiceActivityDetector {
+    // 能量阈值，用于判断是否有语音活动
+    energy_threshold: f32,
+    // 静音帧阈值，用于判断是否为静默
+    silence_frame_threshold: usize,
+    // 语音帧阈值，用于判断是否为语音
+    speech_frame_threshold: usize,
+    // 帧大小（样本数）
+    frame_size: usize,
+    // 采样率
+    sample_rate: u32,
+    // 能量历史
+    energy_history: VecDeque<f32>,
+    // 当前状态
+    is_speech: bool,
+    // 连续语音帧计数
+    speech_frame_count: usize,
+    // 连续静默帧计数
+    silence_frame_count: usize,
+}
+
+impl Default for VoiceActivityDetector {
+    fn default() -> Self {
+        Self {
+            energy_threshold: 0.01,
+            silence_frame_threshold: 10,
+            speech_frame_threshold: 3,
+            frame_size: 1024,
+            sample_rate: 16000,
+            energy_history: VecDeque::with_capacity(30), // 保存30帧的能量历史
+            is_speech: false,
+            speech_frame_count: 0,
+            silence_frame_count: 0,
+        }
+    }
+}
+
+impl VoiceActivityDetector {
+    /// 创建新的VAD实例
+    pub fn new() -> Self {
+        Default::default()
+    }
+
+    /// 设置能量阈值
+    pub fn set_energy_threshold(&mut self, threshold: f32) {
+        self.energy_threshold = threshold;
+    }
+
+    /// 设置静音帧阈值
+    pub fn set_silence_frame_threshold(&mut self, threshold: usize) {
+        self.silence_frame_threshold = threshold;
+    }
+
+    /// 设置语音帧阈值
+    pub fn set_speech_frame_threshold(&mut self, threshold: usize) {
+        self.speech_frame_threshold = threshold;
+    }
+
+    /// 设置帧大小
+    pub fn set_frame_size(&mut self, frame_size: usize) {
+        self.frame_size = frame_size;
+    }
+
+    /// 设置采样率
+    pub fn set_sample_rate(&mut self, sample_rate: u32) {
+        self.sample_rate = sample_rate;
+    }
+
+    /// 计算音频帧的能量
+    pub fn calculate_frame_energy(&self, frame: &[f32]) -> f32 {
+        if frame.is_empty() {
+            return 0.0;
+        }
+
+        let sum: f32 = frame.iter().map(|&x| x * x).sum();
+        sum / frame.len() as f32
+    }
+
+    /// 处理一帧音频数据
+    pub fn process_frame(&mut self, frame: &[f32]) -> bool {
+        // 计算当前帧的能量
+        let energy = self.calculate_frame_energy(frame);
+        
+        // 更新能量历史
+        self.energy_history.push_back(energy);
+        if self.energy_history.len() > 30 {
+            self.energy_history.pop_front();
+        }
+
+        // 判断当前帧是否为语音
+        let is_current_speech = energy > self.energy_threshold;
+
+        if is_current_speech {
+            // 重置静默帧计数
+            self.silence_frame_count = 0;
+            // 增加语音帧计数
+            self.speech_frame_count += 1;
+
+            // 如果连续语音帧达到阈值，切换到语音状态
+            if !self.is_speech && self.speech_frame_count >= self.speech_frame_threshold {
+                self.is_speech = true;
+            }
+        } else {
+            // 重置语音帧计数
+            self.speech_frame_count = 0;
+            // 增加静默帧计数
+            self.silence_frame_count += 1;
+
+            // 如果连续静默帧达到阈值，切换到静默状态
+            if self.is_speech && self.silence_frame_count >= self.silence_frame_threshold {
+                self.is_speech = false;
+            }
+        }
+
+        self.is_speech
+    }
+
+    /// 检查是否检测到语音活动
+    pub fn is_speech(&self) -> bool {
+        self.is_speech
+    }
+
+    /// 重置VAD状态
+    pub fn reset(&mut self) {
+        self.energy_history.clear();
+        self.is_speech = false;
+        self.speech_frame_count = 0;
+        self.silence_frame_count = 0;
+    }
+
+    /// 获取当前的能量历史
+    pub fn energy_history(&self) -> &VecDeque<f32> {
+        &self.energy_history
+    }
+}
+
+/// 音频分片器
+/// 用于根据VAD结果将音频分割成合适的片段
+#[derive(Debug, Clone)]
+pub struct AudioSegmenter {
+    vad: VoiceActivityDetector,
+    // 最大片段长度（样本数）
+    max_segment_length: usize,
+    // 当前片段
+    current_segment: Vec<f32>,
+    // 采样率
+    sample_rate: u32,
+}
+
+impl Default for AudioSegmenter {
+    fn default() -> Self {
+        Self {
+            vad: VoiceActivityDetector::default(),
+            max_segment_length: 16000 * 5, // 5秒音频 @ 16kHz
+            current_segment: Vec::new(),
+            sample_rate: 16000,
+        }
+    }
+}
+
+impl AudioSegmenter {
+    /// 创建新的音频分片器
+    pub fn new(sample_rate: u32) -> Self {
+        let mut segmenter = Self::default();
+        segmenter.sample_rate = sample_rate;
+        segmenter.vad.set_sample_rate(sample_rate);
+        segmenter
+    }
+
+    /// 设置VAD参数
+    pub fn set_vad_params(
+        &mut self,
+        energy_threshold: f32,
+        silence_frame_threshold: usize,
+        speech_frame_threshold: usize,
+        frame_size: usize,
+    ) {
+        self.vad.set_energy_threshold(energy_threshold);
+        self.vad.set_silence_frame_threshold(silence_frame_threshold);
+        self.vad.set_speech_frame_threshold(speech_frame_threshold);
+        self.vad.set_frame_size(frame_size);
+    }
+
+    /// 设置最大片段长度
+    pub fn set_max_segment_length(&mut self, max_length_ms: u64) {
+        let max_length_samples = (max_length_ms as f32 * self.sample_rate as f32 / 1000.0) as usize;
+        self.max_segment_length = max_length_samples;
+    }
+
+    /// 处理音频数据
+    /// 返回是否需要分片
+    pub fn process_audio(&mut self, audio_data: &[f32]) -> bool {
+        // 将音频数据分帧处理
+        for frame in audio_data.chunks(self.vad.frame_size) {
+            // 处理当前帧
+            let is_speech = self.vad.process_frame(frame);
+
+            // 将当前帧添加到当前片段
+            self.current_segment.extend_from_slice(frame);
+
+            // 检查是否需要分片
+            // 1. 如果片段长度超过最大值
+            // 2. 如果检测到静默且片段不为空
+            if self.current_segment.len() > self.max_segment_length || (!is_speech && !self.current_segment.is_empty()) {
+                return true;
+            }
+        }
+
+        false
+    }
+
+    /// 获取当前片段
+    pub fn get_current_segment(&self) -> &[f32] {
+        &self.current_segment
+    }
+
+    /// 取出当前片段
+    pub fn take_current_segment(&mut self) -> Vec<f32> {
+        std::mem::take(&mut self.current_segment)
+    }
+
+    /// 重置分片器
+    pub fn reset(&mut self) {
+        self.vad.reset();
+        self.current_segment.clear();
+    }
+
+    /// 检查是否有语音活动
+    pub fn is_speech(&self) -> bool {
+        self.vad.is_speech()
+    }
+}
@@ -56,7 +56,7 @@ impl Default for AppConfig {
             ignored_version: String::new(),
             output_language: "auto".to_string(),
             enable_streaming: false, // 默认关闭流式输出
-            streaming_interval: 2000, // 默认 2000 毫秒
+            streaming_interval: 500, // 默认 500 毫秒
             trigger_mode: "hold".to_string(), // 默认按住输入模式
             speech_service: "siliconflow".to_string(), // 默认使用SiliconFlow语音识别服务
             // 指示器默认配置
 
@@ -3,49 +3,6 @@
 pub enum AppState {
     Idle,
     Recording,
-    Streaming, // 新增：流式处理状态
     Processing,
-    Cancelled, // 新增：取消状态
-}
-
-/// 流式处理状态
-#[derive(Debug, Clone)]
-pub struct StreamingState {
-    pub current_chunk: Vec<f32>,
-    pub chunk_count: usize,
-    pub last_result: String,
-    pub output_sent: bool,
-    // API调用节流相关字段
-    pub processing: bool, // 是否有正在进行的API调用
-}
-
-impl StreamingState {
-    /// 创建新的流式处理状态
-    pub fn new() -> Self {
-        // 合理设置初始容量，避免过度分配
-        let initial_chunk_capacity = 16000; // 1秒音频 @ 16kHz
-        
-        Self {
-            current_chunk: Vec::with_capacity(initial_chunk_capacity),
-            chunk_count: 0,
-            last_result: String::new(),
-            output_sent: false,
-            // API调用节流默认参数
-            processing: false,
-        }
-    }
-    
-    /// 更新最后结果
-    pub fn update_last_result(&mut self, result: &str) {
-        self.last_result = result.to_string();
-    }
-    
-    /// 重置流式处理状态
-    pub fn reset(&mut self) {
-        self.current_chunk.clear();
-        self.chunk_count = 0;
-        self.last_result.clear();
-        self.output_sent = false;
-        self.processing = false;
-    }
+    Cancelled, // 取消状态
 }
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | | `@@ -1 +1,2 @@` |
| `1` | | `-pub mod processor;` |
| | `1` | `+pub mod processor;` |
| | `2` | `+pub mod vad;` |