Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
<p>

<p align="center">
&nbsp&nbsp🤗 <a href="https://huggingface.co/collections/Qwen/qwen3-asr">Hugging Face</a>&nbsp&nbsp | &nbsp&nbsp🤖 <a href="https://modelscope.cn/collections/Qwen/Qwen3-ASR">ModelScope</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://qwen.ai/blog?id=qwen3asr">Blog</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://arxiv.org/abs/2601.21337">Paper</a>&nbsp&nbsp
&nbsp&nbsp🤗 <a href="https://huggingface.co/collections/Qwen/qwen3-asr">Hugging Face</a>&nbsp&nbsp | &nbsp&nbsp🤖 <a href="https://modelscope.cn/collections/Qwen/Qwen3-ASR">ModelScope</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://qwen.ai/blog?id=qwen3asr">Blog</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://github.com/QwenLM/Qwen3-ASR/blob/main/assets/Qwen3_ASR.pdf">Paper</a>&nbsp&nbsp
<br>
🖥️ <a href="https://huggingface.co/spaces/Qwen/Qwen3-ASR">Hugging Face Demo</a>&nbsp&nbsp | &nbsp&nbsp 🖥️ <a href="https://modelscope.cn/studios/Qwen/Qwen3-ASR">ModelScope Demo</a>&nbsp&nbsp | &nbsp&nbsp💬 <a href="https://github.com/QwenLM/Qwen/blob/main/assets/wechat.png">WeChat (微信)</a>&nbsp&nbsp | &nbsp&nbsp🫨 <a href="https://discord.gg/CV4E9rpNSD">Discord</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://help.aliyun.com/zh/model-studio/qwen-speech-recognition">API</a>

Expand Down Expand Up @@ -41,7 +41,7 @@ We release **Qwen3-ASR**, a family that includes two powerful all-in-one speech
- [Fine Tuning](#fine-tuning)
- [Docker](#docker)
- [Evaluation](#evaluation)
- [Citation](#citation)
<!-- - [Citation](#citation) -->


## Overview
Expand Down Expand Up @@ -1420,18 +1420,18 @@ During evaluation, we ran inference for all models with `dtype=torch.bfloat16` a
</details>


## Citation
<!-- ## Citation

If you find our paper and code useful in your research, please consider giving a star :star: and citation :pencil: :)

```BibTeX
@article{Qwen3-ASR,
title={Qwen3-ASR Technical Report},
author={Xian Shi, Xiong Wang, Zhifang Guo, Yongqi Wang, Pei Zhang, Xinyu Zhang, Zishan Guo, Hongkun Hao, Yu Xi, Baosong Yang, Jin Xu, Jingren Zhou, Junyang Lin},
journal={arXiv preprint arXiv:2601.21337},
author={},
journal={arXiv preprint arXiv:},
year={2026}
}
```
``` -->


## Star History
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "qwen-asr"
version = "0.0.6"
version = "0.0.4"
description = "Qwen-ASR python package"
readme = "README.md"
requires-python = ">=3.9"
Expand Down
20 changes: 18 additions & 2 deletions qwen_asr/cli/demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,12 @@ def build_parser() -> argparse.ArgumentParser:
help="Qwen3-ForcedAligner checkpoint path or HF repo id (optional; enables timestamps when provided).",
)

parser.add_argument(
"--campplus-model",
default="FunAudioLLM/Fun-CosyVoice3-0.5B-2512/campplus.onnx",
help="Campplus model path for speaker diarization (optional).",
)

parser.add_argument(
"--backend",
default="transformers",
Expand Down Expand Up @@ -306,6 +312,7 @@ def _make_timestamp_html(audio_upload: Any, timestamps: Any) -> str:
word = str(item.get("text", "") or "")
start = item.get("start_time", None)
end = item.get("end_time", None)
speaker = item.get("speaker", None)
if start is None or end is None:
continue

Expand All @@ -328,13 +335,17 @@ def _make_timestamp_html(audio_upload: Any, timestamps: Any) -> str:
b64 = base64.b64encode(mem.read()).decode("utf-8")
audio_src = f"data:audio/wav;base64,{b64}"

speaker_label = f"Speaker {speaker}" if speaker is not None else ""
speaker_style = f"border-left: 4px solid hsl({(speaker % 12) * 30}, 70%, 50%);" if speaker is not None else ""

html_content += f"""
<div class="word-box">
<div class="word-box" style="{speaker_style}">
<div class="word-text">{word}</div>
<div class="word-time">{start} - {end} s</div>
<div class="word-time">{start:.2f} - {end:.2f} s</div>
<div class="word-audio">
<audio controls preload="none" src="{audio_src}"></audio>
</div>
{f'<div style="font-size: 11px; color: #888; margin-top: 4px;">{speaker_label}</div>' if speaker_label else ''}
</div>
"""

Expand Down Expand Up @@ -438,6 +449,7 @@ def run(audio_upload: Any, lang_disp: str, return_ts: bool):
text=getattr(t, "text", None),
start_time=getattr(t, "start_time", None),
end_time=getattr(t, "end_time", None),
speaker=getattr(t, "speaker", None),
)
for t in (getattr(r, "time_stamps", None) or [])
]
Expand Down Expand Up @@ -490,6 +502,10 @@ def main(argv=None) -> int:
user_backend_kwargs = _parse_json_dict(args.backend_kwargs, name="--backend-kwargs")
user_aligner_kwargs = _parse_json_dict(args.aligner_kwargs, name="--aligner-kwargs")

# Add campplus model path to aligner kwargs
if hasattr(args, "campplus_model") and args.campplus_model:
user_aligner_kwargs["campplus_model"] = args.campplus_model

backend_kwargs = _merge_dicts(_default_backend_kwargs(backend), user_backend_kwargs)
backend_kwargs = _coerce_special_types(backend_kwargs)

Expand Down
192 changes: 192 additions & 0 deletions qwen_asr/inference/cluster_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# Modified from 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker)

import scipy
import torch
import sklearn
import numpy as np

from sklearn.cluster._kmeans import k_means
from sklearn.cluster import HDBSCAN


class SpectralCluster:
    r"""A spectral clustering method using the unnormalized Laplacian of an affinity matrix.

    Pipeline: cosine-similarity matrix -> p-pruning -> symmetrization ->
    unnormalized Laplacian -> eigen-decomposition (speaker count picked by the
    largest eigen-gap unless an oracle count is given) -> k-means on the
    spectral embeddings.

    This implementation is adapted from https://github.com/speechbrain/speechbrain.
    """

    def __init__(self, min_num_spks=1, max_num_spks=15, pval=0.022):
        # Bounds for the estimated number of speakers, and the fraction of
        # the largest similarities kept per row during pruning.
        self.min_num_spks = min_num_spks
        self.max_num_spks = max_num_spks
        self.pval = pval

    def __call__(self, X, oracle_num=None):
        """Cluster embeddings X ([N, C]) and return integer labels of length N.

        Args:
            X: Speaker-embedding matrix, one row per segment.
            oracle_num: Known number of speakers, or None to estimate it
                from the eigen-gap.
        """
        # Similarity matrix computation
        sim_mat = self.get_sim_mat(X)

        # Refining similarity matrix with pval
        prunned_sim_mat = self.p_pruning(sim_mat)

        # Symmetrization
        sym_prund_sim_mat = 0.5 * (prunned_sim_mat + prunned_sim_mat.T)

        # Laplacian calculation
        laplacian = self.get_laplacian(sym_prund_sim_mat)

        # Get Spectral Embeddings
        emb, num_of_spk = self.get_spec_embs(laplacian, oracle_num)

        # Perform clustering
        labels = self.cluster_embs(emb, num_of_spk)

        return labels

    def get_sim_mat(self, X):
        """Return the [N, N] pairwise cosine-similarity matrix of X."""
        # Cosine similarities
        M = sklearn.metrics.pairwise.cosine_similarity(X, X)
        return M

    def p_pruning(self, A):
        """Zero out all but the top (pval * N) similarities in each row of A.

        Mutates and returns A. The pval is raised when N is small so that at
        least ~6 neighbors per row survive the pruning.
        """
        if A.shape[0] * self.pval < 6:
            pval = 6.0 / A.shape[0]
        else:
            pval = self.pval

        n_elems = int((1 - pval) * A.shape[0])

        # For each row in a affinity matrix
        for i in range(A.shape[0]):
            low_indexes = np.argsort(A[i, :])
            low_indexes = low_indexes[0:n_elems]

            # Replace smaller similarity values by 0s
            A[i, low_indexes] = 0
        return A

    def get_laplacian(self, M):
        """Return the unnormalized Laplacian L = D - M (diagonal of M zeroed first)."""
        M[np.diag_indices(M.shape[0])] = 0
        D = np.sum(np.abs(M), axis=1)
        D = np.diag(D)
        L = D - M
        return L

    def get_spec_embs(self, L, k_oracle=None):
        """Eigen-decompose L and return (spectral embeddings, speaker count).

        When k_oracle is None, the speaker count is the position of the
        largest gap between consecutive eigenvalues, searched within
        [min_num_spks, max_num_spks].
        """
        lambdas, eig_vecs = scipy.linalg.eigh(L)

        if k_oracle is not None:
            num_of_spk = k_oracle
        else:
            lambda_gap_list = self.getEigenGaps(
                lambdas[self.min_num_spks - 1 : self.max_num_spks + 1]
            )
            num_of_spk = np.argmax(lambda_gap_list) + self.min_num_spks

        emb = eig_vecs[:, :num_of_spk]
        return emb, num_of_spk

    def cluster_embs(self, emb, k):
        """Run k-means with k clusters on the spectral embeddings; return labels."""
        _, labels, _ = k_means(emb, k)
        return labels

    def getEigenGaps(self, eig_vals):
        """Return the list of differences between consecutive eigenvalues."""
        eig_vals_gap_list = []
        for i in range(len(eig_vals) - 1):
            gap = float(eig_vals[i + 1]) - float(eig_vals[i])
            eig_vals_gap_list.append(gap)
        return eig_vals_gap_list


class UmapHdbscan:
    r"""Cluster speaker embeddings with UMAP reduction followed by HDBSCAN.

    Reference:
    - Siqi Zheng, Hongbin Suo. Reformulating Speaker Diarization as Community Detection With
      Emphasis On Topological Structure. ICASSP2022
    """

    def __init__(
        self, n_neighbors=20, n_components=60, min_samples=10, min_cluster_size=10, metric="cosine"
    ):
        # Hyper-parameters for the UMAP projection and the HDBSCAN stage.
        self.n_neighbors = n_neighbors
        self.n_components = n_components
        self.min_samples = min_samples
        self.min_cluster_size = min_cluster_size
        self.metric = metric

    def __call__(self, X):
        """Return integer cluster labels for the [N, C] embedding matrix X."""
        # Imported lazily so the module loads without umap installed.
        import umap.umap_ as umap

        # UMAP requires fewer output dimensions than input points; cap at N - 2.
        target_dims = min(self.n_components, X.shape[0] - 2)
        reducer = umap.UMAP(
            n_neighbors=self.n_neighbors,
            min_dist=0.0,
            n_components=target_dims,
            metric=self.metric,
        )
        projected = reducer.fit_transform(X)

        clusterer = HDBSCAN(
            min_samples=self.min_samples,
            min_cluster_size=self.min_cluster_size,
            allow_single_cluster=True,
        )
        return clusterer.fit_predict(projected)


class ClusterBackend(torch.nn.Module):
    r"""Perform clustering for input embeddings and output the labels.

    Chooses spectral clustering for small inputs (or whenever the speaker
    count is known) and UMAP + HDBSCAN for large inputs, then merges speaker
    clusters whose centroids are nearly collinear.

    Args:
        merge_thr: Cosine-similarity threshold above which two speaker
            clusters are merged into one (stored in ``model_config``).
    """

    def __init__(self, merge_thr=0.78):
        super().__init__()
        self.model_config = {"merge_thr": merge_thr}

        self.spectral_cluster = SpectralCluster()
        self.umap_hdbscan_cluster = UmapHdbscan()

    def forward(self, X, **params):
        """Cluster embeddings X of shape [N, C]; return an int label per row.

        Keyword Args:
            oracle_num: Known number of speakers; when given, spectral
                clustering is forced with exactly that many clusters and the
                post-hoc merge step is skipped.
        """
        # clustering and return the labels
        k = params.get("oracle_num")
        assert len(X.shape) == 2, "modelscope error: the shape of input should be [N, C]"
        # Too few segments to cluster reliably: assume a single speaker.
        if X.shape[0] < 20:
            return np.zeros(X.shape[0], dtype="int")
        if X.shape[0] < 2048 or k is not None:
            # default
            # unexpected corner case
            labels = self.spectral_cluster(X, k)
        else:
            labels = self.umap_hdbscan_cluster(X)

        # Only merge when the speaker count was estimated, not given.
        if k is None and "merge_thr" in self.model_config:
            labels = self.merge_by_cos(labels, X, self.model_config["merge_thr"])

        return labels

    def merge_by_cos(self, labels, embs, cos_thr):
        """Iteratively merge the most similar pair of speaker clusters.

        Repeats until the best remaining pair's centroid cosine similarity
        falls below ``cos_thr`` or a single cluster remains. Mutates and
        returns ``labels``; label values stay contiguous from 0.
        """
        # merge the similar speakers by cosine similarity
        assert cos_thr > 0 and cos_thr <= 1
        while True:
            spk_num = labels.max() + 1
            if spk_num == 1:
                break
            # Centroid of each current cluster.
            spk_center = []
            for i in range(spk_num):
                spk_emb = embs[labels == i].mean(0)
                spk_center.append(spk_emb)
            assert len(spk_center) > 0
            spk_center = np.stack(spk_center, axis=0)
            norm_spk_center = spk_center / np.linalg.norm(spk_center, axis=1, keepdims=True)
            affinity = np.matmul(norm_spk_center, norm_spk_center.T)
            # Keep each unordered pair once (strict upper triangle).
            affinity = np.triu(affinity, 1)
            spks = np.unravel_index(np.argmax(affinity), affinity.shape)
            if affinity[spks] < cos_thr:
                break
            # Fold the higher-indexed cluster into the lower one, then shift
            # the labels above it down so numbering stays contiguous.
            for i in range(len(labels)):
                if labels[i] == spks[1]:
                    labels[i] = spks[0]
                elif labels[i] > spks[1]:
                    labels[i] -= 1
        return labels
16 changes: 4 additions & 12 deletions qwen_asr/inference/qwen3_asr.py
Original file line number Diff line number Diff line change
Expand Up @@ -557,7 +557,8 @@ def _offset_align_result(self, result: Any, offset_sec: float) -> Any:
for it in result.items:
items.append(type(it)(text=it.text,
start_time=round(it.start_time + offset_sec, 3),
end_time=round(it.end_time + offset_sec, 3)))
end_time=round(it.end_time + offset_sec, 3),
speaker=getattr(it, 'speaker', None)))
return type(result)(items=items)

def _merge_align_results(self, results: List[Any]) -> Optional[Any]:
Expand Down Expand Up @@ -733,17 +734,8 @@ def streaming_transcribe(self, pcm16k: np.ndarray, state: ASRStreamingState) ->
prefix = ""
else:
cur_ids = self.processor.tokenizer.encode(state._raw_decoded)
k = int(state.unfixed_token_num)
while True:
end_idx = max(0, len(cur_ids) - k)
prefix = self.processor.tokenizer.decode(cur_ids[:end_idx]) if end_idx > 0 else ""
if '\ufffd' not in prefix:
break
else:
if end_idx == 0:
prefix = ""
break
k += 1
end_idx = max(1, len(cur_ids) - int(state.unfixed_token_num))
prefix = self.processor.tokenizer.decode(cur_ids[:end_idx])

prompt = state.prompt_raw + prefix

Expand Down
Loading