* add sensevoice doc

lxowalle · lxowalle · commit 3f798f8ed438 · 2025-12-02T20:31:58.000+08:00
diff --git a/docs/doc/en/audio/recognize.md b/docs/doc/en/audio/recognize.md
@@ -68,6 +68,27 @@ By default, it recognizes Chinese. To recognize English, pass the language param
 whisper = nn.Whisper(model="/root/models/whisper-base/whisper-base.mud", language="en")
 ```
 
+## Using SenseVoice for Speech-to-Text
+
+Currently, only `MaixCAM2` supports `SenseVoice`, and all SenseVoice-related code is implemented in `Python`, so only Python-side examples are provided.
+By default, the system does not include the `SenseVoice` model. Please download it from [here](https://huggingface.co/sipeed/sensevoice-maixcam2) and place it in the `/root/models/` directory.
+
+Before using it, you need to start the `sensevoice.service` service. The command is as follows:
+> Note that `sensevoice.service` starts from the `/root/models/sensevoice-maixcam2` directory by default, so make sure the model is placed under `/root/models/`.
+```shell
+systemctl start sensevoice.service
+```
+
+You can also start it manually:
+
+```shell
+cd /root/models/sensevoice-maixcam2
+python server.py
+```
+
+After the service is started, you can perform speech recognition via HTTP interaction.
+For usage, please refer to the example: [asr_sensevoice.py](https://github.com/sipeed/MaixPy/tree/main/examples/audio/asr/sensevoice/asr_sensevoice.py)
+
 ## Maix-Speech
 
 [`Maix-Speech`](https://github.com/sipeed/Maix-Speech) is an offline speech recognition library specifically designed for embedded environments. It has been deeply optimized for speech recognition algorithms, significantly reducing memory usage while maintaining excellent recognition accuracy. For detailed information, please refer to the [Maix-Speech Documentation](https://github.com/sipeed/Maix-Speech/blob/master/usage_zh.md).
diff --git a/docs/doc/zh/audio/recognize.md b/docs/doc/zh/audio/recognize.md
@@ -23,6 +23,7 @@ update:
 | ------- | ------- | ----------- | -------- |
 | Whisper | ❌       | ❌           | ✅        |
 | Speech  | ✅       | ✅           | ❌        |
+| Sensevoice  | ❌       | ❌           | ✅       |
 
 ## 使用Whisper做语音转文字
 
@@ -68,6 +69,24 @@ whisper: 开始愉快的探索吧
 whisper = nn.Whisper(model="/root/models/whisper-base/whisper-base.mud", language="en")
 ```
 
+## 使用Sensevoice做语音转文字
+
+目前只有`MaixCAM2`支持`Sensevoice`, 并且`Sensevoice`的相关代码都是`python`实现的, 因此只提供了python端的示例
+系统默认没有`Sensevoice`模型，请从[这里](https://huggingface.co/sipeed/sensevoice-maixcam2)自行下载并放置在`/root/models/`目录下
+
+使用前需要启动`sensevoice.service`服务, 启动的命令如下:
+> 需要注意`sensevoice.service`默认是从`/root/models/sensevoice-maixcam2`目录下启动，因此请一定将模型放置在`/root/models/`目录下
+```shell
+systemctl start sensevoice.service
+```
+你也可以手动启动
+```shell
+cd /root/models/sensevoice-maixcam2
+python server.py
+```
+
+启动服务后可以通过http交互的方法完成语音识别, 使用方法请查看示例：[asr_sensevoice.py](https://github.com/sipeed/MaixPy/tree/main/examples/audio/asr/sensevoice/asr_sensevoice.py)
+
 ## Maix-Speech
 
 [`Maix-Speech`](https://github.com/sipeed/Maix-Speech) 是一款专为嵌入式环境设计的离线语音识别库，针对语音识别算法进行了深度优化，显著降低内存占用，同时在识别准确率方面表现优异。详细说明请参考 [Maix-Speech 使用文档](https://github.com/sipeed/Maix-Speech/blob/master/usage_zh.md)。
diff --git a/examples/audio/asr/sensevoice/asr_sensevoice.py b/examples/audio/asr/sensevoice/asr_sensevoice.py
@@ -0,0 +1,146 @@
+import requests, json, os
+import librosa
+
+class SensevoiceClient:
+    """HTTP client for the local SenseVoice speech-recognition service.
+
+    The client talks to the service at ``url`` through the ``/status``,
+    ``/start_model``, ``/_stop_model``, ``/asr`` and ``/asr_stream``
+    endpoints, and starts/stops ``sensevoice.service`` via ``systemctl``.
+
+    Args:
+        model: Model path sent to ``/start_model`` as ``model_path``.
+        url: Base URL of the service.
+        lauguage: Recognition language. The misspelled name is kept so that
+            existing callers keep working.
+        stream: True to use ``refer_stream()``, False to use ``refer()``.
+        language: Correctly spelled alias of ``lauguage``; takes precedence
+            when it is not None.
+    """
+
+    # Seconds to wait for a /status reply, so that polling cannot block
+    # forever on a peer that accepts the connection but never answers.
+    STATUS_TIMEOUT = 5
+
+    def __init__(self, model = "", url="http://0.0.0.0:12347", lauguage="auto", stream=False, language=None):
+        self.model = model
+        self.url = url
+        self.stream = stream
+        # The attribute keeps its original spelling for compatibility.
+        self.launguage = lauguage if language is None else language
+
+    def _check_service(self):
+        """Return True if ``GET /status`` answers with HTTP 200, else False."""
+        try:
+            response = requests.get(self.url + '/status', timeout=self.STATUS_TIMEOUT)
+        except requests.RequestException:
+            # Only request failures mean "not up"; KeyboardInterrupt must
+            # still be able to break the polling loop in _start_service().
+            return False
+        return response.status_code == 200
+
+    def _start_service(self, timeout=None):
+        """Start ``sensevoice.service`` if needed and wait until it responds.
+
+        Args:
+            timeout: Maximum number of seconds to wait. None (the default)
+                waits indefinitely.
+
+        Returns:
+            True once the service answers, False if ``timeout`` expired.
+        """
+        import time
+        if not self._check_service():
+            os.system("systemctl start sensevoice.service")
+
+        deadline = None if timeout is None else time.monotonic() + timeout
+        while not self._check_service():
+            if deadline is not None and time.monotonic() >= deadline:
+                return False
+            print("Waiting for service to start...")
+            time.sleep(1)
+
+        return True
+
+    def _stop_service(self):
+        """Stop ``sensevoice.service`` through ``systemctl``."""
+        os.system("systemctl stop sensevoice.service")
+
+    def _get_status(self):
+        """Return the ``status`` field of ``GET /status``.
+
+        Returns "not loaded" whenever the status cannot be obtained
+        (request failure, non-200 reply, or a reply without ``status``).
+        """
+        try:
+            response = requests.get(self.url + '/status', timeout=self.STATUS_TIMEOUT)
+            if response.status_code == 200:
+                return response.json()["status"]
+        except Exception:
+            return "not loaded"
+        return "not loaded"
+
+    def _post_expect_status(self, path, expected, payload=None):
+        """POST to ``path``; return True if the reply's ``status`` is ``expected``."""
+        try:
+            response = requests.post(self.url + path, json=payload)
+            if response.status_code != 200:
+                return False
+            return response.json()["status"] == expected
+        except Exception as e:
+            print("Requests failed:", e)
+            return False
+
+    def _start_model(self):
+        """Ask the service to load the model; return True if it reports 'loaded'."""
+        data = {
+            "model_path": self.model,
+            "sample_rate": 16000,
+            "language": self.launguage,
+            "stream": self.stream
+        }
+        return self._post_expect_status('/start_model', 'loaded', data)
+
+    def _stop_model(self):
+        """Ask the service to unload the model; return True if it reports 'not loaded'."""
+        # NOTE(review): unlike '/start_model', this path carries a leading
+        # underscore - confirm it against the routes defined in server.py.
+        return self._post_expect_status('/_stop_model', 'not loaded')
+
+    def start(self):
+        """Start the service and load the model.
+
+        Returns:
+            True if both steps succeeded, False otherwise.
+        """
+        if self._start_service():
+            print("Service started successfully.")
+        else:
+            print("Failed to start service.")
+            return False
+
+        if self._start_model():
+            print("Model started successfully.")
+        else:
+            print("Failed to start model.")
+            return False
+        return True
+
+    def stop_model(self):
+        """Unload the model but leave the service running."""
+        self._stop_model()
+
+    def stop(self):
+        """Unload the model, then stop the service."""
+        self._stop_model()
+        self._stop_service()
+
+    def get_wave_form(self, path):
+        """Load the audio file at ``path`` with librosa at 16000 Hz and return the samples."""
+        waveform, _ = librosa.load(path, sr=16000)
+        return waveform
+
+    def refer(self, filepath):
+        """Recognize a whole audio file with one ``POST /asr`` request.
+
+        Returns:
+            The ``text`` field of the reply, or "" if the client is in
+            streaming mode or the request failed.
+        """
+        if self.stream:
+            print("Streaming mode, use refer_stream() instead.")
+            return ""
+        waveform = self.get_wave_form(filepath)
+        data = {
+            "audio_data": waveform.tolist(),
+            "sample_rate": 16000,
+            # Send the configured language instead of a hard-coded "auto".
+            # NOTE(review): the key is spelled "launguage" here but
+            # "language" in _start_model() - confirm what the server reads.
+            "launguage": self.launguage
+        }
+        try:
+            response = requests.post(self.url + '/asr', json=data)
+            if response.status_code == 200:
+                return response.json().get("text", "")
+            print(f"Requests failed: {response.status_code}")
+            return ""
+        except Exception as e:
+            print("Requests failed:", e)
+            return ""
+
+    def refer_stream(self, filepath):
+        """Recognize an audio file via ``POST /asr_stream``, yielding partial results.
+
+        This is a generator: nothing is sent until it is iterated. Each
+        non-empty line of the reply is parsed as JSON and its ``text`` field
+        (or "") is yielded. Nothing is yielded if the client is not in
+        streaming mode or the request fails.
+        """
+        if not self.stream:
+            print("Not in streaming mode, use refer() instead.")
+            return
+        waveform = self.get_wave_form(filepath)
+        data = {
+            "audio_data": waveform.tolist(),
+            "sample_rate": 16000,
+            # See the note in refer() about this key's spelling.
+            "launguage": self.launguage,
+            "step": 0.1,
+        }
+        print('start post')
+        try:
+            # The context manager releases the streamed connection even if
+            # the consumer stops iterating early.
+            with requests.post(self.url + '/asr_stream', json=data, stream=True) as response:
+                if response.status_code != 200:
+                    print(f"Requests failed: {response.status_code}")
+                    return
+                for line in response.iter_lines():
+                    if line:
+                        chunk = json.loads(line)
+                        yield chunk.get("text", "")
+        except Exception as e:
+            print("Requests failed:", e)
+            return
+
+# Demo: recognize example/zh.mp3, either in one shot or as a stream.
+stream = True
+client = SensevoiceClient(
+    model="/root/models/sensevoice-maixcam2/model.mud",
+    stream=stream,
+)
+
+# Bail out early when the service or the model could not be brought up.
+if not client.start():
+    print("Failed to start service or model.")
+    exit()
+
+if stream:
+    # Streaming mode: print every partial result as it arrives.
+    print('start refer stream')
+    for text in client.refer_stream("example/zh.mp3"):
+        print(text)
+else:
+    # One-shot mode: a single request returns the full transcript.
+    print('start refer')
+    text = client.refer("example/zh.mp3")
+    print(text)