Skip to content

Commit ae04109

Browse files
WEIFENG2333, claude, happy-otter
committed
feat: add inference progress callback support
Server stores progress in _progress dict during inference, exposes get_progress RPC method. Client supports progress_callback parameter on Model.infer() with threaded polling. Blocking RPC methods now run in executor so progress queries can be served concurrently.

Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Happy <yesreply@happy.engineering>
1 parent 3d251a4 commit ae04109

File tree

4 files changed

+128
-7
lines changed

4 files changed

+128
-7
lines changed

src/funasr_server/client.py

Lines changed: 39 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -81,6 +81,7 @@ def infer(
8181
merge_vad: bool = None,
8282
merge_length_s: float = None,
8383
output_timestamp: bool = None,
84+
progress_callback=None,
8485
**kwargs,
8586
) -> list:
8687
"""Run inference on this model.
@@ -96,12 +97,13 @@ def infer(
9697
merge_vad: Merge short VAD segments.
9798
merge_length_s: Max merge length in seconds.
9899
output_timestamp: Include timestamps in output.
100+
progress_callback: Optional callable(current, total) for progress.
99101
**kwargs: Additional generate() parameters.
100102
101103
Returns:
102104
List of result dicts.
103105
"""
104-
return self._client.infer(
106+
infer_kwargs = dict(
105107
audio=audio, text=text, audio_bytes=audio_bytes,
106108
name=self.name,
107109
language=language, use_itn=use_itn, batch_size=batch_size,
@@ -110,6 +112,34 @@ def infer(
110112
**kwargs,
111113
)
112114

115+
if progress_callback is None:
116+
return self._client.infer(**infer_kwargs)
117+
118+
import threading
119+
120+
result_box = [None]
121+
error_box = [None]
122+
123+
def _do_infer():
124+
try:
125+
result_box[0] = self._client.infer(**infer_kwargs)
126+
except Exception as e:
127+
error_box[0] = e
128+
129+
t = threading.Thread(target=_do_infer)
130+
t.start()
131+
while t.is_alive():
132+
try:
133+
p = self.get_progress()
134+
progress_callback(p["current"], p["total"])
135+
except Exception:
136+
pass
137+
t.join(timeout=0.5)
138+
139+
if error_box[0] is not None:
140+
raise error_box[0]
141+
return result_box[0]
142+
113143
def transcribe(
114144
self,
115145
audio: str = None,
@@ -119,6 +149,10 @@ def transcribe(
119149
"""Transcribe audio — convenience alias for infer()."""
120150
return self.infer(audio=audio, audio_bytes=audio_bytes, **kwargs)
121151

152+
def get_progress(self) -> dict:
153+
"""Get inference progress. Returns {"current": int, "total": int}."""
154+
return self._client.get_progress(name=self.name)
155+
122156
def unload(self) -> dict:
123157
"""Unload this model from the server."""
124158
return self._client.unload_model(name=self.name)
@@ -367,6 +401,10 @@ def unload_model(self, name: str) -> dict:
367401
"""Unload a model by name. Prefer ``model.unload()`` instead."""
368402
return self._rpc_call("unload_model", {"name": name})
369403

404+
def get_progress(self, name: str) -> dict:
405+
"""Get inference progress. Returns {"current": int, "total": int}."""
406+
return self._rpc_call("get_progress", {"name": name}, timeout=5)
407+
370408
def infer(
371409
self,
372410
name: str,

src/funasr_server/runtime_template/server.py

Lines changed: 33 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
"""
1010

1111
import argparse
12+
import asyncio
1213
import base64
1314
import io
1415
import json
@@ -32,6 +33,7 @@
3233

3334
_models: dict = {} # name -> AutoModel instance
3435
_model_kwargs: dict = {} # name -> kwargs used to create it
36+
_progress: dict = {} # name -> {"current": int, "total": int}
3537
_exec_globals: dict = {"__builtins__": __builtins__} # shared exec namespace
3638

3739

@@ -184,9 +186,16 @@ def rpc_infer(params: dict) -> dict:
184186
if input_data is None:
185187
raise ValueError("'input' or 'input_base64' is required")
186188

189+
def _on_progress(current, total):
190+
_progress[name] = {"current": current, "total": total}
191+
192+
_progress[name] = {"current": 0, "total": 0}
187193
try:
188-
result = model.generate(input=input_data, **generate_kwargs)
194+
result = model.generate(
195+
input=input_data, progress_callback=_on_progress, **generate_kwargs
196+
)
189197
finally:
198+
_progress.pop(name, None)
190199
if tmp_file and os.path.exists(tmp_file):
191200
os.unlink(tmp_file)
192201

@@ -284,6 +293,19 @@ def rpc_list_models(params: dict) -> dict:
284293
}
285294

286295

296+
def rpc_get_progress(params: dict) -> dict:
297+
"""Get inference progress for a model.
298+
299+
Params:
300+
name (str): Model cache key (default: "default")
301+
302+
Returns:
303+
{"current": int, "total": int} — 0/0 if not running.
304+
"""
305+
name = params.get("name", "default")
306+
return _progress.get(name, {"current": 0, "total": 0})
307+
308+
287309
def rpc_shutdown(params: dict) -> dict:
288310
"""Gracefully shut down the server."""
289311
logger.info("Shutdown requested")
@@ -300,9 +322,14 @@ def rpc_shutdown(params: dict) -> dict:
300322
"execute": rpc_execute,
301323
"download_model": rpc_download_model,
302324
"list_models": rpc_list_models,
325+
"get_progress": rpc_get_progress,
303326
"shutdown": rpc_shutdown,
304327
}
305328

329+
# Methods that block for a long time and must run in a thread pool
330+
# so other requests (like get_progress) can be handled concurrently.
331+
_BLOCKING_METHODS = {"infer", "transcribe", "load_model", "download_model"}
332+
306333

307334
# ---------------------------------------------------------------------------
308335
# HTTP handler
@@ -333,7 +360,11 @@ async def handle_rpc(request: Request):
333360
return JSONResponse(_error(req_id, -32601, f"Method not found: {method}"))
334361

335362
try:
336-
result = handler(params)
363+
if method in _BLOCKING_METHODS:
364+
loop = asyncio.get_event_loop()
365+
result = await loop.run_in_executor(None, handler, params)
366+
else:
367+
result = handler(params)
337368
return JSONResponse(_ok(req_id, result))
338369
except Exception as e:
339370
logger.exception(f"Error in method '{method}'")

tests/test_client.py

Lines changed: 44 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,8 @@ def do_POST(self):
3838
result = {"models": {}}
3939
elif method == "execute":
4040
result = {"output": "ok", "return_value": None}
41+
elif method == "get_progress":
42+
result = {"current": 3, "total": 10}
4143
elif method == "download_model":
4244
result = {"model": params.get("model"), "path": "/tmp/model", "hub": params.get("hub", "ms")}
4345
elif method == "error_test":
@@ -417,3 +419,45 @@ def test_rpc_rejects_no_result(client):
417419
def test_rpc_rejects_malformed_error(client):
418420
with pytest.raises(ConnectionError, match="Malformed JSON-RPC error"):
419421
client._rpc_call("malformed_error", {})
422+
423+
424+
# ------------------------------------------------------------------
425+
# Progress
426+
# ------------------------------------------------------------------
427+
428+
def test_get_progress(client):
429+
result = client.get_progress(name="test")
430+
assert result["current"] == 3
431+
assert result["total"] == 10
432+
433+
434+
def test_model_get_progress(client):
435+
with patch("funasr_server.client.get_hub", return_value="ms"):
436+
model = client.load_model("test-model", name="my_model")
437+
result = model.get_progress()
438+
assert result["current"] == 3
439+
assert result["total"] == 10
440+
441+
442+
def test_model_infer_with_progress_callback(client):
443+
with patch("funasr_server.client.get_hub", return_value="ms"):
444+
model = client.load_model("test-model")
445+
446+
calls = []
447+
448+
def on_progress(current, total):
449+
calls.append((current, total))
450+
451+
result = model.infer(audio="test.wav", progress_callback=on_progress)
452+
assert result[0]["text"] == "hello world"
453+
# progress_callback should have been called at least once
454+
assert len(calls) >= 1
455+
assert calls[0] == (3, 10)
456+
457+
458+
def test_model_infer_without_progress_callback(client):
459+
with patch("funasr_server.client.get_hub", return_value="ms"):
460+
model = client.load_model("test-model")
461+
# Without progress_callback — should work exactly as before
462+
result = model.infer(audio="test.wav")
463+
assert result[0]["text"] == "hello world"

tests/test_server.py

Lines changed: 12 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -213,7 +213,9 @@ def test_rpc_infer():
213213

214214
result = server.rpc_infer({"input": "test.wav"})
215215
assert result["results"] == [{"key": "test", "text": "hello"}]
216-
mock_model.generate.assert_called_once_with(input="test.wav")
216+
call_kwargs = mock_model.generate.call_args[1]
217+
assert call_kwargs["input"] == "test.wav"
218+
assert "progress_callback" in call_kwargs
217219

218220

219221
def test_rpc_infer_by_name():
@@ -233,7 +235,11 @@ def test_rpc_infer_passes_extra_kwargs():
233235
server._models["default"] = mock_model
234236

235237
server.rpc_infer({"input": "test.wav", "language": "zh", "use_itn": True})
236-
mock_model.generate.assert_called_once_with(input="test.wav", language="zh", use_itn=True)
238+
call_kwargs = mock_model.generate.call_args[1]
239+
assert call_kwargs["input"] == "test.wav"
240+
assert call_kwargs["language"] == "zh"
241+
assert call_kwargs["use_itn"] is True
242+
assert "progress_callback" in call_kwargs
237243

238244

239245
def test_rpc_infer_model_not_loaded():
@@ -276,7 +282,9 @@ def test_rpc_transcribe_maps_params():
276282

277283
result = server.rpc_transcribe({"audio": "test.wav"})
278284
assert result["results"] == [{"key": "test", "text": "hello"}]
279-
mock_model.generate.assert_called_once_with(input="test.wav")
285+
call_kwargs = mock_model.generate.call_args[1]
286+
assert call_kwargs["input"] == "test.wav"
287+
assert "progress_callback" in call_kwargs
280288

281289

282290
def test_rpc_transcribe_maps_audio_base64():
@@ -411,5 +419,5 @@ def test_rpc_list_models_multiple():
411419

412420
def test_methods_dispatch_table():
413421
expected = {"health", "load_model", "unload_model", "infer", "transcribe",
414-
"execute", "download_model", "list_models", "shutdown"}
422+
"execute", "download_model", "list_models", "get_progress", "shutdown"}
415423
assert expected == set(server._METHODS.keys())

0 commit comments

Comments
 (0)