
Commit 5bb122e

Merge remote-tracking branch 'origin/main' into device_memory_monitor
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2 parents: e2c4a03 + 11f59f6

File tree

122 files changed: +16771 −517 lines


.buildkite/pipeline.yml

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ steps:
     key: image-build
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker build --file docker/Dockerfile.ci -t vllm-omni-ci ."
+      - "docker build --progress=plain --file docker/Dockerfile.ci -t vllm-omni-ci ."
       - "docker tag vllm-omni-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT"
       - "docker push public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT"
     agents:

.buildkite/test-nightly.yml

Lines changed: 2 additions & 2 deletions
@@ -13,7 +13,7 @@ steps:
   - kubernetes:
       podSpec:
         containers:
-          - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+          - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
            resources:
              limits:
                nvidia.com/gpu: 2
@@ -69,7 +69,7 @@ steps:
   - kubernetes:
       podSpec:
         containers:
-          - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+          - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
            resources:
              limits:
                nvidia.com/gpu: 2
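The image references above changed from the public ECR gallery to a private ECR pull-through cache, which serves an upstream image under `<account>.dkr.ecr.<region>.amazonaws.com/<cache-prefix>/<upstream-path>` with the tag unchanged. As a sketch, the rewrite applied throughout this commit can be expressed mechanically (the registry and cache prefix are taken from the diff; the helper function itself is ours, not part of this repo):

```python
def to_pull_through_cache(public_image: str,
                          registry: str = "936637512419.dkr.ecr.us-west-2.amazonaws.com",
                          cache_prefix: str = "vllm-ci-pull-through-cache") -> str:
    """Rewrite a public.ecr.aws image reference to its pull-through-cache form.

    ECR pull-through caching exposes an upstream repository under
    <registry>/<cache_prefix>/<upstream-path>, keeping the tag unchanged.
    """
    upstream_host = "public.ecr.aws/"
    if not public_image.startswith(upstream_host):
        raise ValueError(f"not a public.ecr.aws reference: {public_image}")
    upstream_path = public_image[len(upstream_host):]
    return f"{registry}/{cache_prefix}/{upstream_path}"


print(to_pull_through_cache(
    "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT"))
# -> 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
```

Routing pulls through the cache keeps CI agents inside the private registry (avoiding public-gallery rate limits) while still tracking the upstream repository.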

.buildkite/test-ready.yml

Lines changed: 4 additions & 4 deletions
@@ -132,7 +132,7 @@ steps:
 #  - kubernetes:
 #      podSpec:
 #        containers:
-#          - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+#          - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
 #           resources:
 #             limits:
 #               nvidia.com/gpu: 2
@@ -192,7 +192,7 @@ steps:
 #  - kubernetes:
 #      podSpec:
 #        containers:
-#          - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+#          - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
 #           resources:
 #             limits:
 #               nvidia.com/gpu: 2
@@ -251,7 +251,7 @@ steps:
 #  - kubernetes:
 #      podSpec:
 #        containers:
-#          - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+#          - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
 #           resources:
 #             limits:
 #               nvidia.com/gpu: 1
@@ -288,7 +288,7 @@ steps:
 #  - kubernetes:
 #      podSpec:
 #        containers:
-#          - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+#          - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
 #           resources:
 #             limits:
 #               nvidia.com/gpu: 1

.github/PULL_REQUEST_TEMPLATE.md

Lines changed: 3 additions & 3 deletions
@@ -12,10 +12,10 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTT
 <summary> Essential Elements of an Effective PR Description Checklist </summary>
 
 - [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
-- [ ] The test plan. Please providing the test scripts & test commands. Please state the reasons if your codes don't require additional test scripts. For test file guidelines, please check the [test style doc](https://docs.vllm.ai/projects/vllm-omni/en/latest/contributing/ci/tests_style/)
-- [ ] The test results. Please pasting the results comparison before and after, or e2e results.
+- [ ] The test plan. Please provide the test scripts & test commands. Please state the reasons if your codes don't require additional test scripts. For test file guidelines, please check the [test style doc](https://docs.vllm.ai/projects/vllm-omni/en/latest/contributing/ci/tests_style/)
+- [ ] The test results. Please paste the results comparison before and after, or the e2e results.
 - [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model. **Please run `mkdocs serve` to sync the documentation editions to `./docs`.**
-- [ ] (Optional) Release notes update. If your change is user facing, please update the release notes draft.
+- [ ] (Optional) Release notes update. If your change is user-facing, please update the release notes draft.
 </details>
 
 **BEFORE SUBMITTING, PLEASE READ <https://github.com/vllm-project/vllm-omni/blob/main/CONTRIBUTING.md>** (anything written below this line will be removed by GitHub Actions)

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -243,5 +243,7 @@ Dockerfile.dev
 discussion
 tmp_test
 
+# Auto-generated version file (created by setuptools_scm during build)
+vllm_omni/_version.py
 # output files
 *.wav
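The newly ignored `vllm_omni/_version.py` is the module setuptools_scm generates at build time, which is why it should never be committed. As a hedged sketch only, a typical `pyproject.toml` fragment that would produce such a file looks like this (the exact configuration used by this repo is not shown in the diff, so treat the keys below as an assumption):

```toml
[build-system]
requires = ["setuptools>=64", "setuptools_scm>=8"]
build-backend = "setuptools.build_meta"

[tool.setuptools_scm]
# Derive the package version from git tags and write it to a generated
# module; that generated file is what the .gitignore entry excludes.
write_to = "vllm_omni/_version.py"
```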

docker/Dockerfile.ci

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ COPY . .
 
 # Install system dependencies
 RUN apt-get update && \
-    apt-get install -y ffmpeg sox libsox-fmt-all jq && \
+    apt-get install -y ffmpeg git sox libsox-fmt-all jq && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 

docker/Dockerfile.rocm

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ WORKDIR ${COMMON_WORKDIR}
 
 # Step 1: Setup - Install system dependencies
 RUN apt-get update && \
-    apt-get install -y ffmpeg && \
+    apt-get install -y ffmpeg git sox libsox-fmt-all jq && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 

docs/.nav.yml

Lines changed: 9 additions & 2 deletions
@@ -13,9 +13,9 @@ nav:
   - examples/README.md
   - Offline Inference:
     - BAGEL-7B-MoT: user_guide/examples/offline_inference/bagel.md
+    - GLM-Image Multistage End-to-End Inference: user_guide/examples/offline_inference/glm_image.md
     - Image-To-Image: user_guide/examples/offline_inference/image_to_image.md
     - Image-To-Video: user_guide/examples/offline_inference/image_to_video.md
-    - LoRA Inference(Diffusion): user_guide/examples/offline_inference/lora_inference.md
     - Qwen2.5-Omni: user_guide/examples/offline_inference/qwen2_5_omni.md
     - Qwen3-Omni: user_guide/examples/offline_inference/qwen3_omni.md
     - Qwen3-TTS: user_guide/examples/offline_inference/qwen3_tts.md
@@ -24,12 +24,14 @@ nav:
     - Text-To-Video: user_guide/examples/offline_inference/text_to_video.md
   - Online Serving:
     - BAGEL-7B-MoT: user_guide/examples/online_serving/bagel.md
+    - GLM-Image Online Serving: user_guide/examples/online_serving/glm_image.md
     - Image-To-Image: user_guide/examples/online_serving/image_to_image.md
-    - LoRA Inference(Diffusion): user_guide/examples/online_serving/lora_inference.md
+    - Image-To-Video: user_guide/examples/online_serving/image_to_video.md
     - Qwen2.5-Omni: user_guide/examples/online_serving/qwen2_5_omni.md
     - Qwen3-Omni: user_guide/examples/online_serving/qwen3_omni.md
     - Qwen3-TTS: user_guide/examples/online_serving/qwen3_tts.md
     - Text-To-Image: user_guide/examples/online_serving/text_to_image.md
+    - Text-To-Video: user_guide/examples/online_serving/text_to_video.md
   - General:
     - usage/*
   - Configuration:
@@ -46,13 +48,17 @@ nav:
     - Quantization:
       - Overview: user_guide/diffusion/quantization/overview.md
       - FP8: user_guide/diffusion/quantization/fp8.md
+      - GGUF: user_guide/diffusion/quantization/gguf.md
     - Parallelism Acceleration: user_guide/diffusion/parallelism_acceleration.md
     - CPU Offloading: user_guide/diffusion/cpu_offload_diffusion.md
     - LoRA: user_guide/diffusion/lora.md
+    - Hybrid Sharded Data Parallel: design/feature/hsdp.md
+    - Custom Pipeline: features/custom_pipeline.md
     - ComfyUI: features/comfyui.md
   - Developer Guide:
     - General:
       - contributing/README.md
+      - pr_reviewer.md
      - glob: contributing/*
        flatten_single_child_sections: true
     - Model Implementation:
@@ -72,6 +78,7 @@ nav:
     - design/feature/tensor_parallel.md
     - design/feature/cache_dit.md
     - design/feature/teacache.md
+    - design/feature/async_chunk_design.md
   - Module Design:
     - design/module/ar_module.md
     - design/module/dit_module.md

docs/api/README.md

Lines changed: 9 additions & 1 deletion
@@ -7,7 +7,6 @@ Main entry points for vLLM-Omni inference and serving.
 - [vllm_omni.entrypoints.async_omni.AsyncOmni][]
 - [vllm_omni.entrypoints.async_omni_diffusion.AsyncOmniDiffusion][]
 - [vllm_omni.entrypoints.async_omni_llm.AsyncOmniLLM][]
-- [vllm_omni.entrypoints.chat_utils.extract_audio_from_video_async][]
 - [vllm_omni.entrypoints.cli.benchmark.base.OmniBenchmarkSubcommandBase][]
 - [vllm_omni.entrypoints.cli.benchmark.main.OmniBenchmarkSubcommand][]
 - [vllm_omni.entrypoints.cli.benchmark.serve.OmniBenchmarkServingSubcommand][]
@@ -19,6 +18,7 @@ Main entry points for vLLM-Omni inference and serving.
 - [vllm_omni.entrypoints.omni_llm.OmniLLM][]
 - [vllm_omni.entrypoints.omni_stage.OmniStage][]
 - [vllm_omni.entrypoints.stage_utils.OmniStageTaskType][]
+- [vllm_omni.entrypoints.zmq_utils.ZmqQueue][]
 
 ## Inputs
 
@@ -36,6 +36,12 @@ Input data structures for multi-modal inputs.
 Engine classes for offline and online inference.
 
 - [vllm_omni.diffusion.diffusion_engine.DiffusionEngine][]
+- [vllm_omni.distributed.omni_connectors.connectors.mooncake_transfer_engine_connector.BufferAllocator][]
+- [vllm_omni.distributed.omni_connectors.connectors.mooncake_transfer_engine_connector.ManagedBuffer][]
+- [vllm_omni.distributed.omni_connectors.connectors.mooncake_transfer_engine_connector.MooncakeAgentMetadata][]
+- [vllm_omni.distributed.omni_connectors.connectors.mooncake_transfer_engine_connector.MooncakeTransferEngineConnector][]
+- [vllm_omni.distributed.omni_connectors.connectors.mooncake_transfer_engine_connector.QueryRequest][]
+- [vllm_omni.distributed.omni_connectors.connectors.mooncake_transfer_engine_connector.QueryResponse][]
 - [vllm_omni.engine.AdditionalInformationEntry][]
 - [vllm_omni.engine.AdditionalInformationPayload][]
 - [vllm_omni.engine.OmniEngineCoreOutput][]
@@ -89,8 +95,10 @@ Configuration classes.
 Worker classes and model runners for distributed inference.
 
 - [vllm_omni.diffusion.worker.diffusion_model_runner.DiffusionModelRunner][]
+- [vllm_omni.diffusion.worker.diffusion_worker.CustomPipelineWorkerExtension][]
 - [vllm_omni.diffusion.worker.diffusion_worker.DiffusionWorker][]
 - [vllm_omni.diffusion.worker.diffusion_worker.WorkerProc][]
+- [vllm_omni.diffusion.worker.diffusion_worker.WorkerWrapperBase][]
 - [vllm_omni.platforms.npu.worker.npu_ar_model_runner.ExecuteModelState][]
 - [vllm_omni.platforms.npu.worker.npu_ar_model_runner.NPUARModelRunner][]
 - [vllm_omni.platforms.npu.worker.npu_ar_worker.NPUARWorker][]

docs/contributing/profiling.md

Lines changed: 0 additions & 3 deletions
@@ -131,9 +131,6 @@ python image_to_video.py \
 
 2. **Wan-AI/Wan2.2-I2V-A14B-Diffusers**: [https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/image_to_video](https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/image_to_video)
 
-> **Note:**
-As of now, asynchronous (online) profiling is not fully supported in vLLM-Omni. While start_profile() and stop_profile() methods exist, they are only reliable in offline inference scripts (e.g., the provided end2end.py examples). Do not use them in server-mode or streaming scenarios—traces may be incomplete or fail to flush.
-
 ### 4. Analyzing Omni Traces
 
 Output files are saved to your configured ```VLLM_TORCH_PROFILER_DIR```.
