
Commit ba81b6d

didier-durand authored and skyloevil committed
[Doc]: fixing typos to improve docs (vllm-project#24480)
Signed-off-by: Didier Durand <[email protected]>
1 parent c1522cd commit ba81b6d

File tree

9 files changed: +12 −12 lines changed

docs/features/tool_calling.md

Lines changed: 1 addition & 1 deletion
@@ -169,7 +169,7 @@ All Llama 3.1, 3.2 and 4 models should be supported.
 
 The tool calling that is supported is the [JSON-based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) introduced by the Llama-3.2 models, see the `pythonic` tool parser below. As for Llama 4 models, it is recommended to use the `llama4_pythonic` tool parser.
 
-Other tool calling formats like the built in python tool calling or custom tool calling are not supported.
+Other tool calling formats like the built-in python tool calling or custom tool calling are not supported.
 
 Known issues:
 
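For context on the parsers named above, here is a hedged client-side sketch of JSON-based tool calling against a vLLM OpenAI-compatible server, assumed to be running locally with the `llama4_pythonic` (or `pythonic`) tool parser enabled; the model name, port, and tool schema are illustrative.

```python
# Hedged sketch: assumes a vLLM OpenAI-compatible server is already running
# locally with tool calling enabled (e.g. the `llama4_pythonic` parser
# recommended above). Model name, port, and the tool schema are illustrative.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

response = client.chat.completions.create(
    model="meta-llama/Llama-4-Scout-17B-16E-Instruct",  # illustrative model name
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=tools,
)
print(response.choices[0].message.tool_calls)
```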

docs/getting_started/installation/gpu/rocm.inc.md

Lines changed: 1 addition & 1 deletion
@@ -119,7 +119,7 @@ Currently, there are no pre-built ROCm wheels.
 This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.
 
 !!! tip
-    - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
+    - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm-up step before collecting perf numbers.
     - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support.
     - To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention.
     - The ROCm version of PyTorch, ideally, should match the ROCm driver version.
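To make the tip concrete, here is a hedged Python sketch that turns off Triton flash attention via the `VLLM_USE_TRITON_FLASH_ATTN=0` flag quoted above and runs a warm-up generation before timing anything; the model name and prompts are illustrative.

```python
import os
import time

# Hedged sketch: set the flag from the tip above before vLLM is imported so it
# is picked up at initialization time. Model name and prompts are illustrative.
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"

from vllm import LLM

llm = LLM(model="facebook/opt-125m")

# Warm-up step before collecting perf numbers, as the tip recommends.
llm.generate("warm-up prompt")

start = time.perf_counter()
outputs = llm.generate("The capital of France is")
print(f"generation took {time.perf_counter() - start:.3f}s")
```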

examples/tool_chat_template_phi4_mini.jinja

Lines changed: 2 additions & 2 deletions
@@ -9,7 +9,7 @@
 <|system|>
 {{ system_message }}
 {%- if tools %}
-In addition to plain text responses, you can chose to call one or more of the provided functions.
+In addition to plain text responses, you can choose to call one or more of the provided functions.
 
 Use the following rule to decide when to call a function:
 * if the response can be generated from your internal knowledge (e.g., as in the case of queries like "What is the capital of Poland?"), do so
@@ -19,7 +19,7 @@ If you decide to call functions:
 * prefix function calls with functools marker (no closing marker required)
 * all function calls should be generated in a single JSON list formatted as functools[{"name": [function name], "arguments": [function arguments as JSON]}, ...]
 * follow the provided JSON schema. Do not hallucinate arguments or values. Do to blindly copy values from the provided samples
-* respect the argument type formatting. E.g., if the type if number and format is float, write value 7 as 7.0
+* respect the argument type formatting. E.g., if the type is number and format is float, write value 7 as 7.0
 * make sure you pick the right functions that match the user intent
 
 
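To make the corrected rule concrete, here is a small illustration of an output that follows the format described in this template: a single JSON list prefixed with the functools marker, with number/float arguments written as floats. The function name and arguments are invented for the example.

```python
import json

# Hedged illustration of the format described above: one JSON list prefixed by
# the "functools" marker, with numeric float arguments written as 7.0 rather
# than 7. The function name and arguments here are invented for the example.
calls = [{"name": "set_thermostat", "arguments": {"temperature": 7.0}}]
model_output = "functools" + json.dumps(calls)
print(model_output)
# functools[{"name": "set_thermostat", "arguments": {"temperature": 7.0}}]
```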

tests/engine/test_executor.py

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ def collective_rpc(self,
                        timeout: Optional[float] = None,
                        args: tuple = (),
                        kwargs: Optional[dict] = None) -> list[Any]:
-        # Drop marker to show that this was ran
+        # Drop marker to show that this was run
         with open(".marker", "w"):
             ...
         return super().collective_rpc(method, timeout, args, kwargs)
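For readability, here is a hedged reconstruction of the test double this hunk sits in: an executor subclass whose `collective_rpc` drops a `.marker` file so the test can verify the override actually ran. The base class and its import path are assumptions, not copied from the file.

```python
from typing import Any, Callable, Optional, Union

# Assumed base class and import path, shown only for illustration; the real
# test may subclass a different executor.
from vllm.executor.uniproc_executor import UniProcExecutor


class CustomUniExecutor(UniProcExecutor):

    def collective_rpc(self,
                       method: Union[str, Callable],
                       timeout: Optional[float] = None,
                       args: tuple = (),
                       kwargs: Optional[dict] = None) -> list[Any]:
        # Drop marker to show that this was run
        with open(".marker", "w"):
            ...
        return super().collective_rpc(method, timeout, args, kwargs)
```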

tests/entrypoints/offline_mode/test_offline_mode.py

Lines changed: 2 additions & 2 deletions
@@ -79,7 +79,7 @@ def disable_connect(*args, **kwargs):
     )
 
     # Need to re-import huggingface_hub
-    # and friends to setup offline mode
+    # and friends to set up offline mode
     _re_import_modules()
     # Cached model files should be used in offline mode
     for model_config in MODEL_CONFIGS:
@@ -136,7 +136,7 @@ def disable_connect(*args, **kwargs):
         disable_connect,
     )
     # Need to re-import huggingface_hub
-    # and friends to setup offline mode
+    # and friends to set up offline mode
     _re_import_modules()
     engine_args = EngineArgs(model="facebook/opt-125m")
     LLM(**dataclasses.asdict(engine_args))
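As a rough sketch of the "re-import to set up offline mode" idea in these comments, assuming the offline switch is the standard `HF_HUB_OFFLINE` environment variable; the real `_re_import_modules()` helper in the test reloads a broader set of modules than shown here.

```python
import importlib
import os
import sys

# Hedged sketch: huggingface_hub reads HF_HUB_OFFLINE at import time, so after
# flipping the variable the already-imported modules are reloaded for the new
# value to take effect. The real _re_import_modules() covers more modules.
os.environ["HF_HUB_OFFLINE"] = "1"
for name in sorted(m for m in sys.modules if m.startswith("huggingface_hub")):
    importlib.reload(sys.modules[name])
```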

tests/kernels/utils.py

Lines changed: 1 addition & 1 deletion
@@ -1247,7 +1247,7 @@ def baseline_scaled_mm(a: torch.Tensor,
     #   then we would expand a to:
     #   a = [[1, 1, 2, 2],
     #        [3, 3, 4, 4]]
-    # NOTE this function this function does not explicitly broadcast dimensions
+    # NOTE this function does not explicitly broadcast dimensions
     # with an extent of 1, since this can be done implicitly by pytorch
     def group_broadcast(t, shape):
         for i, s in enumerate(shape):
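Here is a hedged sketch of the group-broadcast behavior the comment describes (the real helper may be implemented differently), using `torch.repeat_interleave` to reproduce the `[[1, 2], [3, 4]]` to `[[1, 1, 2, 2], [3, 3, 4, 4]]` expansion:

```python
import torch

def group_broadcast_sketch(t: torch.Tensor, shape) -> torch.Tensor:
    # Repeat elements along any dimension whose target extent is a multiple of
    # the source extent; dimensions with extent 1 are left to PyTorch's
    # implicit broadcasting, matching the NOTE above.
    for i, s in enumerate(shape):
        if t.shape[i] != s and t.shape[i] != 1:
            assert s % t.shape[i] == 0
            t = t.repeat_interleave(s // t.shape[i], dim=i)
    return t

a = torch.tensor([[1, 2], [3, 4]])
print(group_broadcast_sketch(a, (2, 4)))
# tensor([[1, 1, 2, 2],
#         [3, 3, 4, 4]])
```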

tests/models/language/generation/test_hybrid.py

Lines changed: 2 additions & 2 deletions
@@ -301,7 +301,7 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
     finished_requests_ids is larger than the maximum mamba block capacity.
 
     This could generally happen due to the fact that hybrid does support
-    statelessness mechanism where it can cleanup new incoming requests in
+    statelessness mechanism where it can clean up new incoming requests in
     a single step.
     """
     try:
@@ -322,7 +322,7 @@ def test_state_cleanup(
     This test is for verifying that the Hybrid state is cleaned up between
     steps.
 
-    If its not cleaned, an error would be expected.
+    If it's not cleaned, an error would be expected.
     """
     try:
         with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:

tests/tpu/test_quantization_accuracy.py

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ def get_model_args(self) -> str:
         expected_value=0.76),  # no bias
     # NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
     # so only one of these tests can run in a single call to pytest. As
-    # a follow up, move this into the LM-EVAL section of the CI.
+    # a follow-up, move this into the LM-EVAL section of the CI.
     # GSM8KAccuracyTestConfig(
     #     model_name="neuralmagic/Qwen2-7B-Instruct-quantized.w8a8",
     #     expected_value=0.66),  # bias in QKV layers

vllm/distributed/parallel_state.py

Lines changed: 1 addition & 1 deletion
@@ -1117,7 +1117,7 @@ def initialize_model_parallel(
             "decode context model parallel group is already initialized")
     # Note(hc): In the current implementation of decode context parallel,
     # dcp_size must not exceed tp_size, because the world size does not
-    # change by DCP, it simply reuse the GPUs of TP group, and split one
+    # change by DCP, it simply reuses the GPUs of TP group, and split one
     # TP group into tp_size//dcp_size DCP groups.
     group_ranks = all_ranks.reshape(
         -1, decode_context_model_parallel_size).unbind(0)
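A small worked example of the arithmetic in the corrected comment: with 8 ranks and `decode_context_model_parallel_size = 2`, the `reshape(-1, dcp_size).unbind(0)` pattern yields `tp_size // dcp_size = 4` DCP groups, each reusing two of the existing TP GPUs. The flat rank layout below is simplified for illustration.

```python
import torch

# Simplified illustration of the reshape/unbind split shown above; the real
# all_ranks tensor is laid out according to the full parallel topology.
all_ranks = torch.arange(8)
decode_context_model_parallel_size = 2

group_ranks = all_ranks.reshape(-1, decode_context_model_parallel_size).unbind(0)
print([g.tolist() for g in group_ranks])
# [[0, 1], [2, 3], [4, 5], [6, 7]]  -> tp_size // dcp_size = 4 DCP groups
```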
