Commit ffa782e

Merge branch 'main' into support_sharegpt

2 parents d904a7e + a4bdbb5

File tree: 11 files changed (+538 −307 lines)

.github/workflows/main.yml

Lines changed: 1 addition & 1 deletion

```diff
@@ -217,7 +217,7 @@ jobs:
       uses: peaceiris/actions-gh-pages@v3
       with:
         github_token: ${{ secrets.GITHUB_TOKEN }}
-        publish_dir: ./ui/out
+        publish_dir: ./src/ui/out
         destination_dir: ui/dev
         keep_files: false
         user_name: ${{ github.actor }}
```

.github/workflows/nightly.yml

Lines changed: 1 addition & 1 deletion

```diff
@@ -238,7 +238,7 @@ jobs:
       uses: peaceiris/actions-gh-pages@v3
       with:
         github_token: ${{ secrets.GITHUB_TOKEN }}
-        publish_dir: ./ui/out
+        publish_dir: ./src/ui/out
         destination_dir: ui/nightly
         keep_files: false
         user_name: ${{ github.actor }}
```

.github/workflows/release-candidate.yml

Lines changed: 1 addition & 1 deletion

```diff
@@ -282,7 +282,7 @@ jobs:
       uses: peaceiris/actions-gh-pages@v3
       with:
         github_token: ${{ secrets.GITHUB_TOKEN }}
-        publish_dir: ./ui/out
+        publish_dir: ./src/ui/out
         destination_dir: ui/release/latest
         keep_files: false
         user_name: ${{ github.actor }}
```

.github/workflows/release.yml

Lines changed: 1 addition & 1 deletion

```diff
@@ -281,7 +281,7 @@ jobs:
       uses: peaceiris/actions-gh-pages@v3
       with:
         github_token: ${{ secrets.GITHUB_TOKEN }}
-        publish_dir: ./ui/out
+        publish_dir: ./src/ui/out
         destination_dir: ui/latest
         keep_files: false
         user_name: ${{ github.actor }}
```

docs/backends.md

Lines changed: 18 additions & 0 deletions

A new section is added after the existing pointer to the [TGI Documentation](https://huggingface.co/docs/text-generation-inference/index):

### 3. llama.cpp

[llama.cpp](https://github.com/ggml-org/llama.cpp) provides a lightweight, OpenAI-compatible server through its [llama-server](https://github.com/ggml-org/llama.cpp/blob/master/tools/server) tool.

To start a llama.cpp server with the gpt-oss-20b model, you can use the following command:

```bash
llama-server -hf ggml-org/gpt-oss-20b-GGUF --alias gpt-oss-20b --ctx-size 0 --jinja -ub 2048 -b 2048
```

Note that the alias `gpt-oss-20b` is provided for the model name because `guidellm` uses it to retrieve model metadata in JSON format, and such metadata is not included in GGUF model repositories. A simple workaround is to download the metadata files from the safetensors repository and place them in a local directory named after the alias:

```bash
huggingface-cli download openai/gpt-oss-20b --include "*.json" --local-dir gpt-oss-20b/
```

Now you can run `guidellm` as usual, and it will fetch the model metadata from the local directory.

The file's existing closing section is unchanged:

## Expanding Backend Support

GuideLLM is an open platform, and we encourage contributions to extend its backend support. Whether it's adding new server implementations, integrating with Python-based backends, or enhancing existing capabilities, your contributions are welcome. For more details on how to contribute, see the [CONTRIBUTING.md](https://github.com/vllm-project/guidellm/blob/main/CONTRIBUTING.md) file.

pdm.toml

Lines changed: 2 additions & 0 deletions

```diff
@@ -1,2 +1,4 @@
+[strategy]
+update = "reuse"
 [lock]
 format = "pylock"
```

pylock.toml

Lines changed: 507 additions & 299 deletions (large diff not rendered)

pyproject.toml

Lines changed: 4 additions & 1 deletion

```diff
@@ -62,6 +62,10 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
+recommended = [
+    "tiktoken>=0.11.0", # For OpenAI tokenizer
+    "blobfile>=3.1.0", # For OpenAI tokenizer
+]
 dev = [
     # build
     "build>=1.0.0",
@@ -102,7 +106,6 @@ dev = [
     "mkdocs-linkcheck~=1.0.6",
 ]
 
-# For PEP 735 compliant tools
 [dependency-groups]
 dev = [ "guidellm[dev]" ]
```
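The new `recommended` extra groups the optional OpenAI-tokenizer dependencies (`tiktoken`, `blobfile`). Assuming the package is installed from PyPI under the name `guidellm`, the extra could be pulled in with:

```shell
pip install "guidellm[recommended]"
```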

src/guidellm/backend/openai.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -688,7 +688,7 @@ def _extract_completions_delta_content(
         return data["choices"][0]["text"]
 
     if type_ == "chat_completions":
-        return data["choices"][0]["delta"]["content"]
+        return data.get("choices", [{}])[0].get("delta", {}).get("content")
 
     raise ValueError(f"Unsupported type: {type_}")
```
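The switch from direct indexing to chained `.get()` calls tolerates stream chunks that omit `choices`, `delta`, or `content`. A minimal standalone sketch (the function below is a simplified stand-in for illustration, not guidellm's actual method):

```python
def extract_chat_delta_content(data: dict):
    """Return the delta text from a chat completions stream chunk, or None.

    Chained .get() calls with safe defaults mean a chunk missing
    "choices", "delta", or "content" yields None instead of raising KeyError.
    """
    return data.get("choices", [{}])[0].get("delta", {}).get("content")


# Normal streaming chunk: the delta carries content.
print(extract_chat_delta_content({"choices": [{"delta": {"content": "Hello"}}]}))  # Hello

# Role-only delta (common in the first/last chunks): direct indexing raised KeyError here.
print(extract_chat_delta_content({"choices": [{"delta": {"role": "assistant"}}]}))  # None

# Chunk with no "choices" at all (e.g. a usage-only chunk).
print(extract_chat_delta_content({"usage": {"total_tokens": 42}}))  # None
```

One edge remains: an explicitly empty `"choices": []` list would still raise `IndexError`, since the `[{}]` default only applies when the key is absent entirely.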

src/guidellm/presentation/data_models.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -190,7 +190,7 @@ class TabularDistributionSummary(DistributionSummary):
     """
 
     @computed_field
-    def percentile_rows(self) -> list[dict[str, float]]:
+    def percentile_rows(self) -> list[dict[str, Union[str, float]]]:
         rows = [
             {"percentile": name, "value": value}
             for name, value in self.percentiles.model_dump().items()
```
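The widened annotation matches the actual row shape: each row pairs a string percentile name with a float value, so the dict values are `Union[str, float]`, not `float` alone. A minimal sketch of the same construction (a plain function for illustration, not the real pydantic `computed_field`):

```python
from typing import Union


def percentile_rows(percentiles: dict[str, float]) -> list[dict[str, Union[str, float]]]:
    """Flatten a {name: value} percentile mapping into table rows.

    Each row mixes a str under "percentile" with a float under "value",
    which is why the value type must be Union[str, float].
    """
    return [{"percentile": name, "value": value} for name, value in percentiles.items()]


rows = percentile_rows({"p50": 12.5, "p90": 30.1, "p99": 55.0})
print(rows[0])  # {'percentile': 'p50', 'value': 12.5}
```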
