Skip to content
This repository was archived by the owner on Sep 4, 2025. It is now read-only.

Commit a72d13a

Browse files
committed
Merge remote-tracking branch 'ibm-vllm/main' into ibm_main_update_05162022
2 parents 059b81b + 2e81ed2 commit a72d13a

File tree

2 files changed

+6
-10
lines changed

2 files changed

+6
-10
lines changed

Dockerfile.ubi

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ COPY --from=gen-protos /workspace/vllm/entrypoints/grpc/pb vllm/entrypoints/grpc
206206
ENV CCACHE_DIR=/root/.cache/ccache
207207
RUN --mount=type=cache,target=/root/.cache/ccache \
208208
--mount=type=cache,target=/root/.cache/pip \
209-
python setup.py bdist_wheel --dist-dir=dist
209+
VLLM_USE_PRECOMPILED=1 python3 setup.py bdist_wheel --dist-dir=dist
210210

211211
#################### FLASH_ATTENTION Build IMAGE ####################
212212
FROM dev as flash-attn-builder
@@ -258,7 +258,7 @@ RUN pip install \
258258
--no-binary="all" \
259259
--no-cache-dir \
260260
"vllm-nccl-cu12==2.18.1.0.4.0" && \
261-
mv /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1 /opt/vllm/
261+
mv /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1 /opt/vllm/lib/
262262

263263
# Install flash attention (from pre-built wheel)
264264
RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
@@ -277,7 +277,7 @@ ENV HF_HUB_OFFLINE=1 \
277277
PORT=8000 \
278278
GRPC_PORT=8033 \
279279
HOME=/home/vllm \
280-
VLLM_NCCL_SO_PATH=/opt/vllm/libnccl.so.2.18.1 \
280+
VLLM_NCCL_SO_PATH=/opt/vllm/lib/libnccl.so.2.18.1 \
281281
VLLM_USAGE_SOURCE=production-docker-image
282282

283283
# setup non-root user for OpenShift

vllm/tgis_utils/metrics.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def observe_queue_time(self, engine_output: RequestOutput):
7171
engine_output.metrics.time_in_queue)
7272

7373
def count_request_failure(self, reason: FailureReasonLabel):
74-
self.tgi_request_failure.labels({"err": reason}).inc(1)
74+
self.tgi_request_failure.labels(err=reason).inc(1)
7575

7676

7777
class TGISStatLogger(StatLogger):
@@ -118,17 +118,13 @@ def log(self, stats: Stats) -> None:
118118
# Then log TGIS specific ones
119119
self.tgi_queue_size.set(stats.num_waiting_sys + stats.num_swapped_sys)
120120
self.tgi_batch_current_size.set(stats.num_running_sys)
121-
self.tgi_queue_size.set(stats.num_waiting_sys + stats.num_swapped_sys)
122-
self.tgi_batch_current_size.set(stats.num_running_sys)
123121

124122
for ttft in stats.time_to_first_tokens_iter:
125123
self.tgi_batch_inference_duration.labels(
126-
{"method": "prefill"}
127-
).observe(ttft)
124+
method="prefill").observe(ttft)
128125
for tpot in stats.time_per_output_tokens_iter:
129126
self.tgi_batch_inference_duration.labels(
130-
{"method": "next_token"}
131-
).observe(tpot)
127+
method="next_token").observe(tpot)
132128

133129
for input_len in stats.num_prompt_tokens_requests:
134130
self.tgi_request_input_length.observe(input_len)

0 commit comments

Comments (0)