Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions examples/inference-server/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Dockerfile of qwenllm/qwen3-asr:cu128

ARG CUDA_VERSION=12.8.0
ARG from=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
FROM ${from} AS base

# Build-time only: keep apt from prompting. As an ARG (not ENV) it does
# not persist into the runtime environment of the image.
ARG DEBIAN_FRONTEND=noninteractive
RUN <<EOF
apt update -y && apt upgrade -y && apt install -y --no-install-recommends \
git \
git-lfs \
python3 \
python3-pip \
python3-dev \
wget \
vim \
libsndfile1 \
ccache \
software-properties-common \
ffmpeg \
ca-certificates \
&& rm -rf /var/lib/apt/lists/*
EOF

# Newer CMake than Ubuntu 22.04 ships; installed under /opt and
# symlinked onto the default PATH.
RUN wget https://github.com/Kitware/CMake/releases/download/v3.26.1/cmake-3.26.1-Linux-x86_64.sh \
    -q -O /tmp/cmake-install.sh \
    && chmod u+x /tmp/cmake-install.sh \
    && mkdir /opt/cmake-3.26.1 \
    && /tmp/cmake-install.sh --skip-license --prefix=/opt/cmake-3.26.1 \
    && rm /tmp/cmake-install.sh \
    && ln -s /opt/cmake-3.26.1/bin/* /usr/local/bin

# Many tools expect a bare `python` executable.
RUN ln -s /usr/bin/python3 /usr/bin/python

RUN git lfs install

WORKDIR /data/shared/Qwen3-ASR

# Parallelism knobs for source builds (e.g. flash-attn), plus a ccache
# location matched by the build cache mount further down.
ENV MAX_JOBS=32
ENV NVCC_THREADS=2
ENV CCACHE_DIR=/root/.cache/ccache

ARG BUNDLE_FLASH_ATTENTION=true

RUN --mount=type=cache,target=/root/.cache/pip \
    pip3 install -U pip setuptools wheel

# The apt-packaged blinker conflicts with the pip-managed one pulled in
# by the server dependencies below.
RUN apt remove python3-blinker -y


RUN --mount=type=cache,target=/root/.cache/pip \
    pip3 install -U "qwen-asr[vllm]" fastapi uvicorn python-multipart requests soundfile scipy websockets

# BUGFIX: the original command passed both `flash-attn` and the git URL,
# i.e. two requirements for the same distribution — pip aborts with
# "Double requirement given". Install the released package only.
RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/pip \
    if [ "$BUNDLE_FLASH_ATTENTION" = "true" ]; then \
    pip3 install -U --no-build-isolation flash-attn; \
    fi

# Pip's cache lives in the BuildKit cache mounts above; clear any stray
# cache written outside them so it is not baked into an image layer.
RUN rm -rf /root/.cache/pip

# BUGFIX: uvicorn listens on 8000 (see CMD) and the companion Makefile
# maps $(PORT):8000 — EXPOSE 80 advertised the wrong port.
EXPOSE 8000

# server.py will be mounted at runtime
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
115 changes: 115 additions & 0 deletions examples/inference-server/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Docker image tag to build and run.
IMAGE ?= qwenllm/qwen3-asr:cu128
# Name given to the running container.
CONTAINER ?= qwen3-asr-fast
# Host port mapped onto the container's uvicorn port (8000).
PORT ?= 8907
# Host directory holding the Hugging Face model cache (bind-mounted in).
HF_HOME ?= /var/lib/docker/container_volumes/hf_models
# vLLM cache, kept alongside the HF cache on the host.
VLLM_CACHE_ROOT ?= $(HF_HOME)/vllm_cache
# Local directory where sample audio files are downloaded.
FILES_DIR ?= files
# Index of the public ASR sample files; the same base URL hosts the
# reference.<ext> files fetched by the `test` target.
TEST_URL_LIST ?= https://raw.githubusercontent.com/kyr0/asr-sample-files/refs/heads/main/index.m3u
# Fraction of GPU memory the server may claim (GPU_MEMORY_UTILIZATION).
GPU_MEM_UTIL ?= 0.15
# Models served by the container.
ASR_MODEL_NAME ?= Qwen/Qwen3-ASR-1.7B
ALIGNER_MODEL_NAME ?= Qwen/Qwen3-ForcedAligner-0.6B

# All targets are commands, not files.
.PHONY: build up down logs health test setup-local test-streaming test-batch test-aligner benchmark-streaming benchmark-batch

# Build the server image from the local Dockerfile.
build:
	docker build --rm --tag $(IMAGE) .

# Start the server container after an interactive confirmation, then
# follow its logs. $(PWD) and $(HF_HOME) are bind-mounted so code edits
# and downloaded models survive container recreation; the container-side
# cache paths are fixed to match the image's WORKDIR layout.
up:
	@echo "Starting container with:"
	@echo "  IMAGE: $(IMAGE)"
	@echo "  CONTAINER: $(CONTAINER)"
	@echo "  PORT: $(PORT)"
	@echo "  HF_HOME: $(HF_HOME)"
	@echo "  VLLM_CACHE_ROOT: $(VLLM_CACHE_ROOT)"
	@echo "  GPU_MEM_UTIL: $(GPU_MEM_UTIL)"
	@echo "  ASR_MODEL_NAME: $(ASR_MODEL_NAME)"
	@echo "  ALIGNER_MODEL_NAME: $(ALIGNER_MODEL_NAME)"
	@# BUGFIX: `read -p` is a bashism; under make's default /bin/sh
	@# (dash on Ubuntu) it fails and the target always aborted. Use
	@# printf for the prompt, and quote the comparison.
	@printf "Proceed? [y/N] " && read ans && [ "$${ans:-N}" = "y" ] || (echo "Aborted." && exit 1)
	mkdir -p $(FILES_DIR)
	docker run -d --name $(CONTAINER) \
		--gpus all --ipc=host \
		-p $(PORT):8000 \
		-e HF_HOME="/data/shared/hf_models" \
		-e VLLM_CACHE_ROOT="/data/shared/hf_models/vllm_cache" \
		-e ENABLE_ASR_MODEL=true \
		-e ENABLE_ALIGNER_MODEL=true \
		-e ASR_MODEL_NAME=$(ASR_MODEL_NAME) \
		-e ALIGNER_MODEL_NAME=$(ALIGNER_MODEL_NAME) \
		-e GPU_MEMORY_UTILIZATION=$(GPU_MEM_UTIL) \
		-v "$(PWD)":/data/shared/Qwen3-ASR \
		-v "$(HF_HOME)":/data/shared/hf_models \
		$(IMAGE)
	@$(MAKE) logs

# Force-remove the container; leading '-' keeps make going when it
# does not exist.
down:
	-docker rm --force $(CONTAINER)

# Tail the container's stdout/stderr until interrupted.
logs:
	docker logs --follow $(CONTAINER)

# Query the server's /health endpoint and pretty-print the JSON reply.
health:
	@echo "Checking health..."
	@curl --silent http://localhost:$(PORT)/health | jq .

# Download the reference samples once, then verify the server
# transcribes each container format (the expected transcript contains
# the German word "referenz"). Exits non-zero on any failure.
test:
	@echo "Running verification test..."
	@mkdir -p $(FILES_DIR)
	@# Download samples if missing. BUGFIX: `wget -O file` creates the
	@# file even when the transfer fails, so a failed download used to
	@# leave a zero-byte file that the existence check then treated as
	@# complete forever. Remove it and fail hard instead.
	@for ext in m4a mp3 wav; do \
		if [ ! -f "$(FILES_DIR)/reference.$$ext" ]; then \
			echo "Downloading reference.$$ext..."; \
			wget -q -O $(FILES_DIR)/reference.$$ext https://raw.githubusercontent.com/kyr0/asr-sample-files/refs/heads/main/reference.$$ext \
				|| { rm -f "$(FILES_DIR)/reference.$$ext"; echo "Download of reference.$$ext failed."; exit 1; }; \
		fi \
	done

	@pass=0; fail=0; \
	for ext in m4a mp3 wav; do \
		file="$(FILES_DIR)/reference.$$ext"; \
		echo "Testing $$file"; \
		resp=$$(curl -s -X POST -F "files=@$$file" "http://localhost:$(PORT)/transcribe?language=de"); \
		echo "Response: $$resp"; \
		if echo "$$resp" | grep -iq "referenz"; then \
			echo "PASS: 'referenz' found."; \
			pass=$$((pass+1)); \
		else \
			echo "FAIL: 'referenz' not found."; \
			fail=$$((fail+1)); \
		fi; \
	done; \
	echo "Results: $$pass PASSED, $$fail FAILED"; \
	if [ $$fail -gt 0 ]; then exit 1; fi

# Prepare the local Python venv and Node.js packages for the client
# tools. Calling the venv's pip directly is equivalent to sourcing
# activate first.
setup-local:
	@echo "Setting up local Python environment..."
	@[ -d .venv ] || python3 -m venv .venv
	@.venv/bin/pip install -r requirements.txt
	@echo "Setting up local Node.js environment..."
	@npm install --silent

# Exercise the websocket streaming endpoint from both the Python and
# Node.js clients.
test-streaming:
	@echo "Running Python Streaming Test..."
	@. .venv/bin/activate && python client-streaming.py -e ws://127.0.0.1:$(PORT)/transcribe-streaming -f $(FILES_DIR)/reference.pcm
	@# BUGFIX: `echo "\n"` is shell-dependent (dash expands the escape,
	@# bash prints it literally); printf is portable.
	@printf "\nRunning Node.js Streaming Test...\n"
	@node client-streaming.js -e ws://127.0.0.1:$(PORT)/transcribe-streaming -f $(FILES_DIR)/reference.pcm

# Send three uploads in a single request to exercise server-side
# batching (same recording in three container formats).
test-batch:
	@echo "Running Batch Verification Test..."
	@curl --silent --request POST "http://127.0.0.1:$(PORT)/transcribe?language=de" \
		--form "files=@$(FILES_DIR)/reference.m4a" \
		--form "files=@$(FILES_DIR)/reference.mp3" \
		--form "files=@$(FILES_DIR)/reference.wav" | jq .

# Transcribe one file with forced alignment enabled and pretty-print
# the word-timing output.
test-aligner:
	@echo "Running Forced Alignment Verification Test..."
	@curl --silent --request POST "http://127.0.0.1:$(PORT)/transcribe?language=de&forced_alignment=true" \
		--form "files=@$(FILES_DIR)/reference.wav" | jq .

# Load-test the streaming endpoint: 4 concurrent clients, 20 requests.
benchmark-streaming:
	@echo "Running Streaming Benchmark (Concurrency: 4, Requests: 20)..."
	@.venv/bin/python benchmark.py --mode streaming --url ws://127.0.0.1:$(PORT)/transcribe-streaming --file $(FILES_DIR)/reference.pcm --clients 4 --requests 20

# Load-test the batch HTTP endpoint: 4 concurrent clients, 20 requests.
benchmark-batch:
	@echo "Running Batch Benchmark (Concurrency: 4, Requests: 20)..."
	@.venv/bin/python benchmark.py --mode batch --url http://127.0.0.1:$(PORT)/transcribe --file $(FILES_DIR)/reference.wav --clients 4 --requests 20

Loading