Skip to content
This repository was archived by the owner on Sep 4, 2025. It is now read-only.

Commit a72d13a

Browse files
committed
Merge remote-tracking branch 'ibm-vllm/main' into ibm_main_update_05162022
2 parents 059b81b + 2e81ed2 commit a72d13a

File tree

2 files changed

+6
-10
lines changed

2 files changed

+6
-10
lines changed

Dockerfile.ubi

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ COPY --from=gen-protos /workspace/vllm/entrypoints/grpc/pb vllm/entrypoints/grpc
206206
ENV CCACHE_DIR=/root/.cache/ccache
207207
RUN --mount=type=cache,target=/root/.cache/ccache \
208208
--mount=type=cache,target=/root/.cache/pip \
209-
python setup.py bdist_wheel --dist-dir=dist
209+
VLLM_USE_PRECOMPILED=1 python3 setup.py bdist_wheel --dist-dir=dist
210210

211211
#################### FLASH_ATTENTION Build IMAGE ####################
212212
FROM dev as flash-attn-builder
@@ -258,7 +258,7 @@ RUN pip install \
258258
--no-binary="all" \
259259
--no-cache-dir \
260260
"vllm-nccl-cu12==2.18.1.0.4.0" && \
261-
mv /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1 /opt/vllm/
261+
mv /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1 /opt/vllm/lib/
262262

263263
# Install flash attention (from pre-built wheel)
264264
RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
@@ -277,7 +277,7 @@ ENV HF_HUB_OFFLINE=1 \
277277
PORT=8000 \
278278
GRPC_PORT=8033 \
279279
HOME=/home/vllm \
280-
VLLM_NCCL_SO_PATH=/opt/vllm/libnccl.so.2.18.1 \
280+
VLLM_NCCL_SO_PATH=/opt/vllm/lib/libnccl.so.2.18.1 \
281281
VLLM_USAGE_SOURCE=production-docker-image
282282

283283
# setup non-root user for OpenShift

vllm/tgis_utils/metrics.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def observe_queue_time(self, engine_output: RequestOutput):
7171
engine_output.metrics.time_in_queue)
7272

7373
def count_request_failure(self, reason: FailureReasonLabel):
74-
self.tgi_request_failure.labels({"err": reason}).inc(1)
74+
self.tgi_request_failure.labels(err=reason).inc(1)
7575

7676

7777
class TGISStatLogger(StatLogger):
@@ -118,17 +118,13 @@ def log(self, stats: Stats) -> None:
118118
# Then log TGIS specific ones
119119
self.tgi_queue_size.set(stats.num_waiting_sys + stats.num_swapped_sys)
120120
self.tgi_batch_current_size.set(stats.num_running_sys)
121-
self.tgi_queue_size.set(stats.num_waiting_sys + stats.num_swapped_sys)
122-
self.tgi_batch_current_size.set(stats.num_running_sys)
123121

124122
for ttft in stats.time_to_first_tokens_iter:
125123
self.tgi_batch_inference_duration.labels(
126-
{"method": "prefill"}
127-
).observe(ttft)
124+
method="prefill").observe(ttft)
128125
for tpot in stats.time_per_output_tokens_iter:
129126
self.tgi_batch_inference_duration.labels(
130-
{"method": "next_token"}
131-
).observe(tpot)
127+
method="next_token").observe(tpot)
132128

133129
for input_len in stats.num_prompt_tokens_requests:
134130
self.tgi_request_input_length.observe(input_len)

0 commit comments

Comments (0)