diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 83b79c972..85e832de1 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -9,7 +9,7 @@ concurrency:
 
 jobs:
   lint:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
       - uses: ./.github/actions/setup-python/
diff --git a/llama-cpp-server/README.md b/llama-cpp-server/README.md
index 4cbaa7b2a..01cdb580b 100644
--- a/llama-cpp-server/README.md
+++ b/llama-cpp-server/README.md
@@ -23,4 +23,4 @@ cd llama.cpp
 docker build -t local/llama.cpp:server-cuda --target server -f .devops/cuda.Dockerfile .
 ```
 
-You can then push this image to a container registry of your choice and then replace the base_image in the config.yaml
\ No newline at end of file
+You can then push this image to a container registry of your choice and then replace the base_image in the config.yaml
diff --git a/llama-cpp-server/config.yaml b/llama-cpp-server/config.yaml
index 9286e6349..64ebb7a85 100644
--- a/llama-cpp-server/config.yaml
+++ b/llama-cpp-server/config.yaml
@@ -1,6 +1,6 @@
 base_image:
   image: alphatozeta/llama-cpp-server:0.4
-build_commands: 
+build_commands:
 - pip install git+https://github.com/huggingface/transformers.git hf-xet
 model_metadata:
   repo_id: google/gemma-3-27b-it-qat-q4_0-gguf
diff --git a/orpheus-best-performance/model/model.py b/orpheus-best-performance/model/model.py
index 00c498f71..6d590ac93 100644
--- a/orpheus-best-performance/model/model.py
+++ b/orpheus-best-performance/model/model.py
@@ -3,7 +3,6 @@
 import torch
 import fastapi
 from snac import SNAC
-import struct
 from pathlib import Path
 import numpy as np
 from fastapi.responses import StreamingResponse
@@ -276,7 +275,7 @@ async def predict(
 
         async def audio_stream(req_id: str):
            token_gen = await self._engine.predict(model_input, request)
-            
+
            if isinstance(token_gen, StreamingResponse):
                token_gen = token_gen.body_iterator