Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
<p>

<p align="center">
&nbsp&nbsp🤗 <a href="https://huggingface.co/collections/Qwen/qwen3-asr">Hugging Face</a>&nbsp&nbsp | &nbsp&nbsp🤖 <a href="https://modelscope.cn/collections/Qwen/Qwen3-ASR">ModelScope</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://qwen.ai/blog?id=qwen3asr">Blog</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://arxiv.org/abs/2601.21337">Paper</a>&nbsp&nbsp
&nbsp&nbsp🤗 <a href="https://huggingface.co/collections/Qwen/qwen3-asr">Hugging Face</a>&nbsp&nbsp | &nbsp&nbsp🤖 <a href="https://modelscope.cn/collections/Qwen/Qwen3-ASR">ModelScope</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://qwen.ai/blog?id=qwen3asr">Blog</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://github.com/QwenLM/Qwen3-ASR/blob/main/assets/Qwen3_ASR.pdf">Paper</a>&nbsp&nbsp
<br>
🖥️ <a href="https://huggingface.co/spaces/Qwen/Qwen3-ASR">Hugging Face Demo</a>&nbsp&nbsp | &nbsp&nbsp 🖥️ <a href="https://modelscope.cn/studios/Qwen/Qwen3-ASR">ModelScope Demo</a>&nbsp&nbsp | &nbsp&nbsp💬 <a href="https://github.com/QwenLM/Qwen/blob/main/assets/wechat.png">WeChat (微信)</a>&nbsp&nbsp | &nbsp&nbsp🫨 <a href="https://discord.gg/CV4E9rpNSD">Discord</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://help.aliyun.com/zh/model-studio/qwen-speech-recognition">API</a>

Expand Down Expand Up @@ -41,7 +41,7 @@ We release **Qwen3-ASR**, a family that includes two powerful all-in-one speech
- [Fine Tuning](#fine-tuning)
- [Docker](#docker)
- [Evaluation](#evaluation)
- [Citation](#citation)
<!-- - [Citation](#citation) -->


## Overview
Expand Down Expand Up @@ -1420,18 +1420,18 @@ During evaluation, we ran inference for all models with `dtype=torch.bfloat16` a
</details>


## Citation
<!-- ## Citation

If you find our paper and code useful in your research, please consider giving a star :star: and citation :pencil: :)

```BibTeX
@article{Qwen3-ASR,
title={Qwen3-ASR Technical Report},
author={Xian Shi, Xiong Wang, Zhifang Guo, Yongqi Wang, Pei Zhang, Xinyu Zhang, Zishan Guo, Hongkun Hao, Yu Xi, Baosong Yang, Jin Xu, Jingren Zhou, Junyang Lin},
journal={arXiv preprint arXiv:2601.21337},
author={},
journal={arXiv preprint arXiv:},
year={2026}
}
```
``` -->


## Star History
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "qwen-asr"
version = "0.0.6"
version = "0.0.4"
description = "Qwen-ASR python package"
readme = "README.md"
requires-python = ">=3.9"
Expand Down
20 changes: 18 additions & 2 deletions qwen_asr/cli/demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,12 @@ def build_parser() -> argparse.ArgumentParser:
help="Qwen3-ForcedAligner checkpoint path or HF repo id (optional; enables timestamps when provided).",
)

parser.add_argument(
"--campplus-model",
default="FunAudioLLM/Fun-CosyVoice3-0.5B-2512/campplus.onnx",
help="Campplus model path for speaker diarization (optional).",
)

parser.add_argument(
"--backend",
default="transformers",
Expand Down Expand Up @@ -306,6 +312,7 @@ def _make_timestamp_html(audio_upload: Any, timestamps: Any) -> str:
word = str(item.get("text", "") or "")
start = item.get("start_time", None)
end = item.get("end_time", None)
speaker = item.get("speaker", None)
if start is None or end is None:
continue

Expand All @@ -328,13 +335,17 @@ def _make_timestamp_html(audio_upload: Any, timestamps: Any) -> str:
b64 = base64.b64encode(mem.read()).decode("utf-8")
audio_src = f"data:audio/wav;base64,{b64}"

speaker_label = f"Speaker {speaker}" if speaker is not None else ""
speaker_style = f"border-left: 4px solid hsl({(speaker % 12) * 30}, 70%, 50%);" if speaker is not None else ""

html_content += f"""
<div class="word-box">
<div class="word-box" style="{speaker_style}">
<div class="word-text">{word}</div>
<div class="word-time">{start} - {end} s</div>
<div class="word-time">{start:.2f} - {end:.2f} s</div>
<div class="word-audio">
<audio controls preload="none" src="{audio_src}"></audio>
</div>
{f'<div style="font-size: 11px; color: #888; margin-top: 4px;">{speaker_label}</div>' if speaker_label else ''}
</div>
"""

Expand Down Expand Up @@ -438,6 +449,7 @@ def run(audio_upload: Any, lang_disp: str, return_ts: bool):
text=getattr(t, "text", None),
start_time=getattr(t, "start_time", None),
end_time=getattr(t, "end_time", None),
speaker=getattr(t, "speaker", None),
)
for t in (getattr(r, "time_stamps", None) or [])
]
Expand Down Expand Up @@ -490,6 +502,10 @@ def main(argv=None) -> int:
user_backend_kwargs = _parse_json_dict(args.backend_kwargs, name="--backend-kwargs")
user_aligner_kwargs = _parse_json_dict(args.aligner_kwargs, name="--aligner-kwargs")

# Add campplus model path to aligner kwargs
if hasattr(args, "campplus_model") and args.campplus_model:
user_aligner_kwargs["campplus_model"] = args.campplus_model

backend_kwargs = _merge_dicts(_default_backend_kwargs(backend), user_backend_kwargs)
backend_kwargs = _coerce_special_types(backend_kwargs)

Expand Down
192 changes: 192 additions & 0 deletions qwen_asr/inference/cluster_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# Modified from 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker)

import scipy
import torch
import sklearn
import numpy as np

from sklearn.cluster._kmeans import k_means
from sklearn.cluster import HDBSCAN


class SpectralCluster:
    r"""A spectral clustering method using the unnormalized Laplacian of an affinity matrix.

    Pipeline: cosine-similarity matrix -> p-pruning -> symmetrization ->
    unnormalized Laplacian -> eigen-decomposition (speaker count picked by the
    largest eigen-gap unless an oracle count is given) -> k-means on the
    spectral embeddings.

    This implementation is adapted from https://github.com/speechbrain/speechbrain.
    """

    def __init__(self, min_num_spks=1, max_num_spks=15, pval=0.022):
        # Bounds for the estimated number of speakers, and the fraction of
        # the largest similarities kept per row during pruning.
        self.min_num_spks = min_num_spks
        self.max_num_spks = max_num_spks
        self.pval = pval

    def __call__(self, X, oracle_num=None):
        """Cluster embeddings X ([N, C]) and return integer labels of length N.

        Args:
            X: Speaker-embedding matrix, one row per segment.
            oracle_num: Known number of speakers, or None to estimate it
                from the eigen-gap.
        """
        # Similarity matrix computation
        sim_mat = self.get_sim_mat(X)

        # Refining similarity matrix with pval
        prunned_sim_mat = self.p_pruning(sim_mat)

        # Symmetrization
        sym_prund_sim_mat = 0.5 * (prunned_sim_mat + prunned_sim_mat.T)

        # Laplacian calculation
        laplacian = self.get_laplacian(sym_prund_sim_mat)

        # Get Spectral Embeddings
        emb, num_of_spk = self.get_spec_embs(laplacian, oracle_num)

        # Perform clustering
        labels = self.cluster_embs(emb, num_of_spk)

        return labels

    def get_sim_mat(self, X):
        """Return the [N, N] pairwise cosine-similarity matrix of X."""
        # Cosine similarities
        M = sklearn.metrics.pairwise.cosine_similarity(X, X)
        return M

    def p_pruning(self, A):
        """Zero out all but the top (pval * N) similarities in each row of A.

        Mutates and returns A. The pval is raised when N is small so that at
        least ~6 neighbors per row survive the pruning.
        """
        if A.shape[0] * self.pval < 6:
            pval = 6.0 / A.shape[0]
        else:
            pval = self.pval

        n_elems = int((1 - pval) * A.shape[0])

        # For each row in a affinity matrix
        for i in range(A.shape[0]):
            low_indexes = np.argsort(A[i, :])
            low_indexes = low_indexes[0:n_elems]

            # Replace smaller similarity values by 0s
            A[i, low_indexes] = 0
        return A

    def get_laplacian(self, M):
        """Return the unnormalized Laplacian L = D - M (diagonal of M zeroed first)."""
        M[np.diag_indices(M.shape[0])] = 0
        D = np.sum(np.abs(M), axis=1)
        D = np.diag(D)
        L = D - M
        return L

    def get_spec_embs(self, L, k_oracle=None):
        """Eigen-decompose L and return (spectral embeddings, speaker count).

        When k_oracle is None, the speaker count is the position of the
        largest gap between consecutive eigenvalues, searched within
        [min_num_spks, max_num_spks].
        """
        lambdas, eig_vecs = scipy.linalg.eigh(L)

        if k_oracle is not None:
            num_of_spk = k_oracle
        else:
            lambda_gap_list = self.getEigenGaps(
                lambdas[self.min_num_spks - 1 : self.max_num_spks + 1]
            )
            num_of_spk = np.argmax(lambda_gap_list) + self.min_num_spks

        emb = eig_vecs[:, :num_of_spk]
        return emb, num_of_spk

    def cluster_embs(self, emb, k):
        """Run k-means with k clusters on the spectral embeddings; return labels."""
        _, labels, _ = k_means(emb, k)
        return labels

    def getEigenGaps(self, eig_vals):
        """Return the list of differences between consecutive eigenvalues."""
        eig_vals_gap_list = []
        for i in range(len(eig_vals) - 1):
            gap = float(eig_vals[i + 1]) - float(eig_vals[i])
            eig_vals_gap_list.append(gap)
        return eig_vals_gap_list


class UmapHdbscan:
    r"""Cluster speaker embeddings with UMAP reduction followed by HDBSCAN.

    Reference:
    - Siqi Zheng, Hongbin Suo. Reformulating Speaker Diarization as Community Detection With
      Emphasis On Topological Structure. ICASSP2022
    """

    def __init__(
        self, n_neighbors=20, n_components=60, min_samples=10, min_cluster_size=10, metric="cosine"
    ):
        # Hyper-parameters for the UMAP projection and the HDBSCAN stage.
        self.n_neighbors = n_neighbors
        self.n_components = n_components
        self.min_samples = min_samples
        self.min_cluster_size = min_cluster_size
        self.metric = metric

    def __call__(self, X):
        """Return integer cluster labels for the [N, C] embedding matrix X."""
        # Imported lazily so the module loads without umap installed.
        import umap.umap_ as umap

        # UMAP requires fewer output dimensions than input points; cap at N - 2.
        target_dims = min(self.n_components, X.shape[0] - 2)
        reducer = umap.UMAP(
            n_neighbors=self.n_neighbors,
            min_dist=0.0,
            n_components=target_dims,
            metric=self.metric,
        )
        projected = reducer.fit_transform(X)

        clusterer = HDBSCAN(
            min_samples=self.min_samples,
            min_cluster_size=self.min_cluster_size,
            allow_single_cluster=True,
        )
        return clusterer.fit_predict(projected)


class ClusterBackend(torch.nn.Module):
    r"""Perform clustering for input embeddings and output the labels.

    Chooses spectral clustering for small inputs (or whenever the speaker
    count is known) and UMAP + HDBSCAN for large inputs, then merges speaker
    clusters whose centroids are nearly collinear.

    Args:
        merge_thr: Cosine-similarity threshold above which two speaker
            clusters are merged into one (stored in ``model_config``).
    """

    def __init__(self, merge_thr=0.78):
        super().__init__()
        self.model_config = {"merge_thr": merge_thr}

        self.spectral_cluster = SpectralCluster()
        self.umap_hdbscan_cluster = UmapHdbscan()

    def forward(self, X, **params):
        """Cluster embeddings X of shape [N, C]; return an int label per row.

        Keyword Args:
            oracle_num: Known number of speakers; when given, spectral
                clustering is forced with exactly that many clusters and the
                post-hoc merge step is skipped.
        """
        # clustering and return the labels
        k = params.get("oracle_num")
        assert len(X.shape) == 2, "modelscope error: the shape of input should be [N, C]"
        # Too few segments to cluster reliably: assume a single speaker.
        if X.shape[0] < 20:
            return np.zeros(X.shape[0], dtype="int")
        if X.shape[0] < 2048 or k is not None:
            # default
            # unexpected corner case
            labels = self.spectral_cluster(X, k)
        else:
            labels = self.umap_hdbscan_cluster(X)

        # Only merge when the speaker count was estimated, not given.
        if k is None and "merge_thr" in self.model_config:
            labels = self.merge_by_cos(labels, X, self.model_config["merge_thr"])

        return labels

    def merge_by_cos(self, labels, embs, cos_thr):
        """Iteratively merge the most similar pair of speaker clusters.

        Repeats until the best remaining pair's centroid cosine similarity
        falls below ``cos_thr`` or a single cluster remains. Mutates and
        returns ``labels``; label values stay contiguous from 0.
        """
        # merge the similar speakers by cosine similarity
        assert cos_thr > 0 and cos_thr <= 1
        while True:
            spk_num = labels.max() + 1
            if spk_num == 1:
                break
            # Centroid of each current cluster.
            spk_center = []
            for i in range(spk_num):
                spk_emb = embs[labels == i].mean(0)
                spk_center.append(spk_emb)
            assert len(spk_center) > 0
            spk_center = np.stack(spk_center, axis=0)
            norm_spk_center = spk_center / np.linalg.norm(spk_center, axis=1, keepdims=True)
            affinity = np.matmul(norm_spk_center, norm_spk_center.T)
            # Keep each unordered pair once (strict upper triangle).
            affinity = np.triu(affinity, 1)
            spks = np.unravel_index(np.argmax(affinity), affinity.shape)
            if affinity[spks] < cos_thr:
                break
            # Fold the higher-indexed cluster into the lower one, then shift
            # the labels above it down so numbering stays contiguous.
            for i in range(len(labels)):
                if labels[i] == spks[1]:
                    labels[i] = spks[0]
                elif labels[i] > spks[1]:
                    labels[i] -= 1
        return labels
16 changes: 4 additions & 12 deletions qwen_asr/inference/qwen3_asr.py
Original file line number Diff line number Diff line change
Expand Up @@ -557,7 +557,8 @@ def _offset_align_result(self, result: Any, offset_sec: float) -> Any:
for it in result.items:
items.append(type(it)(text=it.text,
start_time=round(it.start_time + offset_sec, 3),
end_time=round(it.end_time + offset_sec, 3)))
end_time=round(it.end_time + offset_sec, 3),
speaker=getattr(it, 'speaker', None)))
return type(result)(items=items)

def _merge_align_results(self, results: List[Any]) -> Optional[Any]:
Expand Down Expand Up @@ -733,17 +734,8 @@ def streaming_transcribe(self, pcm16k: np.ndarray, state: ASRStreamingState) ->
prefix = ""
else:
cur_ids = self.processor.tokenizer.encode(state._raw_decoded)
k = int(state.unfixed_token_num)
while True:
end_idx = max(0, len(cur_ids) - k)
prefix = self.processor.tokenizer.decode(cur_ids[:end_idx]) if end_idx > 0 else ""
if '\ufffd' not in prefix:
break
else:
if end_idx == 0:
prefix = ""
break
k += 1
end_idx = max(1, len(cur_ids) - int(state.unfixed_token_num))
prefix = self.processor.tokenizer.decode(cur_ids[:end_idx])

prompt = state.prompt_raw + prefix

Expand Down
Loading