Commit a54c323

Authored by Gaohan123
[Model] Support Qwen3-TTS model series (vllm-project#895)
Signed-off-by: Gaohan123 <hgaoaf@connect.ust.hk>
1 parent 0df8e80

35 files changed: +9,642 −42 lines

docs/.nav.yml

Lines changed: 1 addition & 0 deletions
```diff
@@ -14,6 +14,7 @@ nav:
       - Image-To-Video: user_guide/examples/offline_inference/image_to_video.md
       - Qwen2.5-Omni: user_guide/examples/offline_inference/qwen2_5_omni.md
       - Qwen3-Omni: user_guide/examples/offline_inference/qwen3_omni.md
+      - Qwen3-TTS Offline Inference: user_guide/examples/offline_inference/qwen3_tts.md
       - Text-To-Image: user_guide/examples/offline_inference/text_to_image.md
       - Text-To-Video: user_guide/examples/offline_inference/text_to_video.md
   - Online Serving:
```

docs/api/README.md

Lines changed: 16 additions & 33 deletions
```diff
@@ -36,7 +36,6 @@ Input data structures for multi-modal inputs.
 
 Engine classes for offline and online inference.
 
-- [vllm_omni.diffusion.diffusion_engine.BackgroundResources][]
 - [vllm_omni.diffusion.diffusion_engine.DiffusionEngine][]
 - [vllm_omni.engine.AdditionalInformationEntry][]
 - [vllm_omni.engine.AdditionalInformationPayload][]
@@ -57,38 +56,11 @@ Core scheduling and caching components.
 - [vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler][]
 - [vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler][]
 - [vllm_omni.core.sched.output.OmniNewRequestData][]
-
-## Model Executor
-
-Model execution components.
-
-- [vllm_omni.model_executor.custom_process_mixin.CustomProcessMixin][]
-- [vllm_omni.model_executor.models.output_templates.OmniOutput][]
-- [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_5_omni.Qwen2_5OmniForConditionalGeneration][]
-- [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_5_omni_talker.Qwen2_5OmniTalkerForConditionalGeneration][]
-- [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_5_omni_thinker.Qwen2_5OmniConditionalGenerationMixin][]
-- [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_5_omni_thinker.Qwen2_5OmniThinkerForConditionalGeneration][]
-- [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_5_omni_token2wav.Qwen2_5OmniToken2WavBigVGANModel][]
-- [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_5_omni_token2wav.Qwen2_5OmniToken2WavDiTModel][]
-- [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_5_omni_token2wav.Qwen2_5OmniToken2WavForConditionalGenerationVLLM][]
-- [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_5_omni_token2wav.Qwen2_5OmniToken2WavModel][]
-- [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_old.Qwen2EmbeddingModel][]
-- [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_old.Qwen2ForCausalLM][]
-- [vllm_omni.model_executor.models.qwen2_5_omni.qwen2_old.Qwen2Model][]
-- [vllm_omni.model_executor.models.qwen3_omni.qwen3_moe.Qwen3MoeForCausalLM][]
-- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni.Qwen3OmniMoeForConditionalGeneration][]
-- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_code2wav.Qwen3OmniMoeCode2Wav][]
-- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_code_predictor_mtp.Qwen3OmniCodePredictorBaseModel][]
-- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_code_predictor_mtp.Qwen3OmniMoeTalkerCodePredictor][]
-- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_talker.Qwen3OmniMoeModel][]
-- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_talker.Qwen3OmniMoeTalkerForConditionalGeneration][]
-- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_talker.Qwen3OmniMoeTalkerSharedExpertWrapper][]
-- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3MoeLLMForCausalLM][]
-- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3MoeLLMModel][]
-- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3OmniMoeConditionalGenerationMixin][]
-- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3OmniMoeThinkerForConditionalGeneration][]
-- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3OmniMoeThinkerMultiModalProcessor][]
-- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3OmniMoeThinkerProcessingInfo][]
+- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.vq.core_vq.DistributedGroupResidualVectorQuantization][]
+- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.vq.core_vq.DistributedResidualVectorQuantization][]
+- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.vq.core_vq.EuclideanCodebook][]
+- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.vq.core_vq.VectorQuantization][]
+- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.vq.core_vq.preprocess][]
 
 ## Configuration
 
```
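The `core_vq` entries added above are the classic residual-vector-quantization (RVQ) building blocks: a `EuclideanCodebook` snaps vectors to their nearest code, `VectorQuantization` wraps one codebook, and the residual variants stack several stages so that each stage quantizes what the previous one left over. A generic PyTorch sketch of that idea (illustration only; names, shapes, and signatures here are not vllm_omni's actual API):

```python
# Generic illustration of residual vector quantization (RVQ), the technique
# behind the core_vq classes listed above. Not vllm_omni's actual API.
import torch


def nearest_code(x: torch.Tensor, codebook: torch.Tensor) -> torch.Tensor:
    """Return the index of the closest codebook entry for each row of x."""
    # x: (n, d), codebook: (k, d) -> indices: (n,)
    return torch.cdist(x, codebook).argmin(dim=-1)


def rvq_encode(x, codebooks):
    """Each stage quantizes the residual left over by the previous stage."""
    residual, indices = x, []
    for cb in codebooks:
        idx = nearest_code(residual, cb)
        indices.append(idx)
        residual = residual - cb[idx]  # what this stage failed to capture
    return indices


def rvq_decode(indices, codebooks):
    """Reconstruction is the sum of the selected code vectors per stage."""
    return sum(cb[idx] for cb, idx in zip(codebooks, indices))


torch.manual_seed(0)
books = [torch.randn(256, 8) for _ in range(4)]  # 4 stages, 256 codes each
frames = torch.randn(16, 8)
codes = rvq_encode(frames, books)
recon = rvq_decode(codes, books)
print((frames - recon).norm() / frames.norm())  # error shrinks as stages are added
```

Each frame is thus described by a handful of small integer indices rather than a raw vector, which is presumably why the 25 Hz speech tokenizer here is built on an RVQ stack.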
```diff
@@ -98,6 +70,17 @@ Configuration classes.
 - [vllm_omni.diffusion.cache.teacache.config.TeaCacheConfig][]
 - [vllm_omni.distributed.omni_connectors.utils.config.ConnectorSpec][]
 - [vllm_omni.distributed.omni_connectors.utils.config.OmniTransferConfig][]
+- [vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts.Qwen3TTSConfig][]
+- [vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts.Qwen3TTSSpeakerEncoderConfig][]
+- [vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts.Qwen3TTSTalkerCodePredictorConfig][]
+- [vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts.Qwen3TTSTalkerConfig][]
+- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_12hz.configuration_qwen3_tts_tokenizer_v2.Qwen3TTSTokenizerV2Config][]
+- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_12hz.configuration_qwen3_tts_tokenizer_v2.Qwen3TTSTokenizerV2DecoderConfig][]
+- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1Config][]
+- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1DecoderBigVGANConfig][]
+- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1DecoderConfig][]
+- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1DecoderDiTConfig][]
+- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1EncoderConfig][]
 
 ## Workers
 
```
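The new configuration modules follow the Hugging Face `transformers` naming convention (`configuration_qwen3_tts.py`), which suggests `PretrainedConfig` subclasses. A hedged sketch of how such a config would typically be loaded — the `from_pretrained` call and the nested-attribute name are assumptions based on that convention, not verified against this commit:

```python
# Hedged sketch: assumes Qwen3TTSConfig subclasses transformers.PretrainedConfig,
# as the configuration_qwen3_tts module naming suggests. The nested attribute
# name (talker_config) is a guess based on Qwen3TTSTalkerConfig existing.
from vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts import (
    Qwen3TTSConfig,
)

config = Qwen3TTSConfig.from_pretrained("Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice")
print(config)                       # dumps the full nested configuration
print(type(config.talker_config))   # e.g. Qwen3TTSTalkerConfig (assumed)
```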
docs/mkdocs/hooks/generate_api_readme.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -34,10 +34,10 @@
         "name": "Core",
         "description": "Core scheduling and caching components.",
     },
-    "model_executor": {
-        "name": "Model Executor",
-        "description": "Model execution components.",
-    },
+    # "model_executor": {
+    #     "name": "Model Executor",
+    #     "description": "Model execution components.",
+    # },
     "config": {
         "name": "Configuration",
         "description": "Configuration classes.",
```

docs/models/supported_models.md

Lines changed: 3 additions & 0 deletions
```diff
@@ -34,6 +34,9 @@ th {
 |`StableDiffusion3Pipeline` | Stable-Diffusion-3 | `stabilityai/stable-diffusion-3.5-medium` |
 |`Flux2KleinPipeline` | FLUX.2-klein | `black-forest-labs/FLUX.2-klein-4B`, `black-forest-labs/FLUX.2-klein-9B` |
 |`StableAudioPipeline` | Stable-Audio-Open | `stabilityai/stable-audio-open-1.0` |
+|`Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-1.7B-CustomVoice | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` |
+|`Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-1.7B-VoiceDesign | `Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign` |
+|`Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-1.7B-Base | `Qwen/Qwen3-TTS-12Hz-0.6B-Base` |
 
 
 ## List of Supported Models for NPU
```
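The three checkpoints map onto the task variants of the offline example documented below; presumably each `--query-type` selects the matching checkpoint. A quick orientation, using the script and flags the example doc describes:

```bash
# Hypothetical pairing of checkpoints to --query-type values, inferred from
# the variant names; end2end.py and its flags are documented below.
python end2end.py --query-type CustomVoice   # Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice
python end2end.py --query-type VoiceDesign   # Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign
python end2end.py --query-type Base          # Qwen/Qwen3-TTS-12Hz-0.6B-Base
```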
docs/user_guide/examples/offline_inference/qwen3_tts.md

Lines changed: 94 additions & 0 deletions (new file)
# Qwen3-TTS Offline Inference

Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/qwen3_tts>.

This directory contains an offline demo for running Qwen3-TTS models with vLLM Omni. It builds task-specific inputs and generates WAV files locally.

## Model Overview

Qwen3-TTS provides multiple task variants for speech generation:

- **CustomVoice**: Generate speech with a known speaker identity (speaker ID) and an optional instruction.
- **VoiceDesign**: Generate speech from text plus a descriptive instruction that designs a new voice.
- **Base**: Generate speech by cloning a voice from reference audio and its transcript, with optional mode selection.

## Setup

Please refer to the [stage configuration documentation](https://docs.vllm.ai/projects/vllm-omni/en/latest/configuration/stage_configs/) to configure memory allocation appropriately for your hardware setup.

## Quick Start

Run a single sample for a task:

```bash
python end2end.py --query-type CustomVoice
```

Generated audio files are saved to `output_audio/` by default.

## Task Usage

### CustomVoice

Single sample:

```bash
python end2end.py --query-type CustomVoice
```

Batch sample (multiple prompts in one run):

```bash
python end2end.py --query-type CustomVoice --use-batch-sample
```

### VoiceDesign

Single sample:

```bash
python end2end.py --query-type VoiceDesign
```

Batch sample:

```bash
python end2end.py --query-type VoiceDesign --use-batch-sample
```

### Base (Voice Clone)

Single sample:

```bash
python end2end.py --query-type Base
```

Batch sample:

```bash
python end2end.py --query-type Base --use-batch-sample
```

Mode selection for Base:

- `--mode-tag icl` (default): standard mode
- `--mode-tag xvec_only`: enable `x_vector_only_mode` in the request

Example:

```bash
python end2end.py --query-type Base --mode-tag icl
```

## Notes

- The script uses the model paths embedded in `end2end.py`; update them if your local cache path differs.
- Use `--output-dir` (preferred) or `--output-wav` to change the output folder.

## Example materials

??? abstract "end2end.py"

    ``````py
    --8<-- "examples/offline_inference/qwen3_tts/end2end.py"
    ``````
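The flags documented above compose. For instance, Base-mode cloning with `x_vector_only_mode` and a custom output directory (each flag is documented above; combining them in one invocation is an assumption):

```bash
python end2end.py --query-type Base --mode-tag xvec_only --output-dir my_audio/
```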

docs/user_guide/examples/online_serving/text_to_image.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -119,6 +119,7 @@ Use `extra_body` to pass generation parameters:
 | `seed` | int | None | Random seed (reproducible) |
 | `negative_prompt` | str | None | Negative prompt |
 | `num_outputs_per_prompt` | int | 1 | Number of images to generate |
+| `--cfg-parallel-size` | int | 1 | Number of GPUs for CFG parallelism |
 
 ## Response Format
 
```
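Since these parameters travel in `extra_body`, a client call could look like the following hedged sketch (assumes the OpenAI-compatible images endpoint this page documents; the base URL, API key, and model name are placeholders for your deployment):

```python
# Hedged sketch: parameter names come from the table above; endpoint details
# are placeholders, not verified against this commit.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
result = client.images.generate(
    model="stabilityai/stable-diffusion-3.5-medium",
    prompt="a watercolor fox at dawn",
    extra_body={
        "seed": 42,                   # reproducible sampling
        "negative_prompt": "blurry",  # steer away from artifacts
        "num_outputs_per_prompt": 1,  # one image back
    },
)
```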
Lines changed: 84 additions & 0 deletions (new file)
# Qwen3-TTS Offline Inference

This directory contains an offline demo for running Qwen3-TTS models with vLLM Omni. It builds task-specific inputs and generates WAV files locally.

## Model Overview

Qwen3-TTS provides multiple task variants for speech generation:

- **CustomVoice**: Generate speech with a known speaker identity (speaker ID) and an optional instruction.
- **VoiceDesign**: Generate speech from text plus a descriptive instruction that designs a new voice.
- **Base**: Generate speech by cloning a voice from reference audio and its transcript, with optional mode selection.

## Setup

Please refer to the [stage configuration documentation](https://docs.vllm.ai/projects/vllm-omni/en/latest/configuration/stage_configs/) to configure memory allocation appropriately for your hardware setup.

## Quick Start

Run a single sample for a task:

```bash
python end2end.py --query-type CustomVoice
```

Generated audio files are saved to `output_audio/` by default.

## Task Usage

### CustomVoice

Single sample:

```bash
python end2end.py --query-type CustomVoice
```

Batch sample (multiple prompts in one run):

```bash
python end2end.py --query-type CustomVoice --use-batch-sample
```

### VoiceDesign

Single sample:

```bash
python end2end.py --query-type VoiceDesign
```

Batch sample:

```bash
python end2end.py --query-type VoiceDesign --use-batch-sample
```

### Base (Voice Clone)

Single sample:

```bash
python end2end.py --query-type Base
```

Batch sample:

```bash
python end2end.py --query-type Base --use-batch-sample
```

Mode selection for Base:

- `--mode-tag icl` (default): standard mode
- `--mode-tag xvec_only`: enable `x_vector_only_mode` in the request

Example:

```bash
python end2end.py --query-type Base --mode-tag icl
```

## Notes

- The script uses the model paths embedded in `end2end.py`; update them if your local cache path differs.
- Use `--output-dir` (preferred) or `--output-wav` to change the output folder.
